diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -106,7 +106,6 @@
   "Path to folder containing llvm library libomptarget.so")
 
 # Build offloading plugins and device RTLs if they are available.
-add_subdirectory(plugins)
 add_subdirectory(plugins-nextgen)
 add_subdirectory(DeviceRTL)
 add_subdirectory(tools)
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build plugins for the user system if available.
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(common)
-
-# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
-# - build a plugin for an ELF based generic 64-bit target based on libffi.
-# - tmachine: name of the machine processor as used in the cmake build system.
-# - tmachine_name: name of the machine to be printed with the debug messages.
-# - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
-  if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-
-    libomptarget_say("Building ${tmachine_name} offloading plugin.")
-
-    # Define macro to be used as prefix of the runtime messages for this target.
-    add_definitions("-DTARGET_NAME=${tmachine_name}")
-
-    # Define macro with the ELF ID for this target.
-    add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
-
-    add_llvm_library("omptarget.rtl.${tmachine_libname}"
-      SHARED
-
-      ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
-
-      ADDITIONAL_HEADER_DIRS
-      ${LIBOMPTARGET_INCLUDE_DIR}
-      ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}
-
-        LINK_LIBS 
-        PRIVATE
-        elf_common
-        ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
-        ${OPENMP_PTHREAD_LIB}
-        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
-
-      NO_INSTALL_RPATH
-    )
-
-    # Install plugin under the lib destination folder.
-    install(TARGETS "omptarget.rtl.${tmachine_libname}"
-      LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-    set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES
-      INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-      CXX_VISIBILITY_PRESET protected)
-
-    target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE
-      ${LIBOMPTARGET_INCLUDE_DIR}
-      ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
-
-    list(APPEND LIBOMPTARGET_TESTED_PLUGINS
-      "omptarget.rtl.${tmachine_libname}")
-
-    # Report to the parent scope that we are building a plugin.
-    set(LIBOMPTARGET_SYSTEM_TARGETS
-      "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-oldDriver" PARENT_SCOPE)
-    set(LIBOMPTARGET_SYSTEM_TARGETS
-      "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-LTO" PARENT_SCOPE)
-    set(LIBOMPTARGET_TESTED_PLUGINS
-      "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-
-  else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-    libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
-  endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-else()
-  libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
-endif()
-endmacro()
-
-add_subdirectory(aarch64)
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(ppc64)
-add_subdirectory(ppc64le)
-add_subdirectory(ve)
-add_subdirectory(x86_64)
-add_subdirectory(remote)
-
-# Make sure the parent scope can see the plugins that will be created.
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-
diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an aarch64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
-else()
- libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is dual licensed under the MIT and the University of Illinois Open
-# Source Licenses. See LICENSE.txt for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an AMDGPU machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-################################################################################
-set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL
-  "Whether to build AMDGPU plugin")
-if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN)
-  libomptarget_say("Not building AMDGPU offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false")
-  return()
-endif()
-
-# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa
-find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-
-if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
-  libomptarget_say("Not building AMDGPU plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts")
-  return()
-endif()
-
-################################################################################
-# Define the suffix for the runtime messaging dumps.
-add_definitions(-DTARGET_NAME=AMDGPU)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
-   add_definitions(-DLITTLEENDIAN_CPU=1)
-endif()
-
-if(CMAKE_BUILD_TYPE MATCHES Debug)
-  add_definitions(-DDEBUG)
-endif()
-
-set(LIBOMPTARGET_DLOPEN_LIBHSA OFF)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" ${LIBOMPTARGET_DLOPEN_LIBHSA})
-
-if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
-  libomptarget_say("Building AMDGPU plugin linked against libhsa")
-  set(LIBOMPTARGET_EXTRA_SOURCE)
-  set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64)
-else()
-  libomptarget_say("Building AMDGPU plugin for dlopened libhsa")
-  include_directories(dynamic_hsa)
-  set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp)
-  set(LIBOMPTARGET_DEP_LIBRARIES)
-endif()
-
-if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-  # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by
-  # the dynamic linker at runtime. Therefore, allow the symbol to be undefined
-  # when creating a shared library.
-  set(LDFLAGS_UNDEFINED "-Wl,--allow-shlib-undefined")
-else()
-  set(LDFLAGS_UNDEFINED "-Wl,-z,defs")
-endif()
-
-add_llvm_library(omptarget.rtl.amdgpu SHARED
-  impl/impl.cpp
-  impl/interop_hsa.cpp
-  impl/data.cpp
-  impl/get_elf_mach_gfx_name.cpp
-  impl/system.cpp
-  impl/msgpack.cpp
-  src/rtl.cpp
-  ${LIBOMPTARGET_EXTRA_SOURCE}
-
-  ADDITIONAL_HEADER_DIRS
-  ${LIBOMPTARGET_INCLUDE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/impl
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils
-
-  LINK_COMPONENTS
-  Support
-  Object
-
-  LINK_LIBS 
-  PRIVATE
-  elf_common
-  ${LIBOMPTARGET_DEP_LIBRARIES}
-  ${OPENMP_PTHREAD_LIB}
-  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
-  ${LDFLAGS_UNDEFINED}
-
-  NO_INSTALL_RPATH
-)
-add_dependencies(omptarget.rtl.amdgpu omptarget.devicertl.amdgpu)
-
-target_include_directories(
-  omptarget.rtl.amdgpu
-  PRIVATE
-  ${LIBOMPTARGET_INCLUDE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/impl
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils
-)
-
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu PROPERTIES 
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-  CXX_VISIBILITY_PRESET protected)
-
-# Report to the parent scope that we are building a plugin for hsa.
-# This controls whether tests are run for the nvptx offloading target
-# Run them if libhsa is available, or if the user explicitly asked for dlopen
-# Otherwise this plugin is being built speculatively and there may be no hsa available
-option(LIBOMPTARGET_FORCE_AMDGPU_TESTS "Build AMDGPU libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_AMDGPU_GPU OR LIBOMPTARGET_FORCE_AMDGPU_TESTS)
-  # Report to the parent scope that we are building a plugin for amdgpu
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa amdgcn-amd-amdhsa-oldDriver" PARENT_SCOPE)
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa amdgcn-amd-amdhsa-LTO" PARENT_SCOPE)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu")
-  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Not generating amdgcn test targets as libhsa is not linkable")
-  return()
-endif()
-
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
+++ /dev/null
@@ -1,368 +0,0 @@
-//===--- amdgpu/dynamic_hsa/hsa.h --------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The parts of the hsa api that are presently in use by the amdgpu plugin
-//
-//===----------------------------------------------------------------------===//
-#ifndef HSA_RUNTIME_INC_HSA_H_
-#define HSA_RUNTIME_INC_HSA_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-// Detect and set large model builds.
-#undef HSA_LARGE_MODEL
-#if defined(__LP64__) || defined(_M_X64)
-#define HSA_LARGE_MODEL
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-  HSA_STATUS_SUCCESS = 0x0,
-  HSA_STATUS_INFO_BREAK = 0x1,
-  HSA_STATUS_ERROR = 0x1000,
-  HSA_STATUS_ERROR_INVALID_CODE_OBJECT = 0x1010,
-  HSA_STATUS_ERROR_NOT_INITIALIZED = 0x100B,
-} hsa_status_t;
-
-hsa_status_t hsa_status_string(hsa_status_t status, const char **status_string);
-
-typedef struct hsa_dim3_s {
-  uint32_t x;
-  uint32_t y;
-  uint32_t z;
-} hsa_dim3_t;
-
-hsa_status_t hsa_init();
-
-hsa_status_t hsa_shut_down();
-
-typedef struct hsa_agent_s {
-  uint64_t handle;
-} hsa_agent_t;
-
-typedef enum {
-  HSA_DEVICE_TYPE_CPU = 0,
-  HSA_DEVICE_TYPE_GPU = 1,
-  HSA_DEVICE_TYPE_DSP = 2
-} hsa_device_type_t;
-
-typedef enum {
-  HSA_ISA_INFO_NAME_LENGTH = 0,
-  HSA_ISA_INFO_NAME = 1
-} hsa_isa_info_t;
-
-typedef enum {
-  HSA_AGENT_INFO_NAME = 0,
-  HSA_AGENT_INFO_VENDOR_NAME = 1,
-  HSA_AGENT_INFO_FEATURE = 2,
-  HSA_AGENT_INFO_PROFILE = 4,
-  HSA_AGENT_INFO_WAVEFRONT_SIZE = 6,
-  HSA_AGENT_INFO_WORKGROUP_MAX_DIM = 7,
-  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE = 8,
-  HSA_AGENT_INFO_GRID_MAX_DIM = 9,
-  HSA_AGENT_INFO_GRID_MAX_SIZE = 10,
-  HSA_AGENT_INFO_FBARRIER_MAX_SIZE = 11,
-  HSA_AGENT_INFO_QUEUES_MAX = 12,
-  HSA_AGENT_INFO_QUEUE_MIN_SIZE = 13,
-  HSA_AGENT_INFO_QUEUE_MAX_SIZE = 14,
-  HSA_AGENT_INFO_DEVICE = 17,
-  HSA_AGENT_INFO_CACHE_SIZE = 18,
-  HSA_AGENT_INFO_FAST_F16_OPERATION = 24,
-} hsa_agent_info_t;
-
-typedef enum {
-  HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
-  HSA_SYSTEM_INFO_VERSION_MINOR = 1,
-} hsa_system_info_t;
-
-typedef enum {
-  HSA_AGENT_FEATURE_KERNEL_DISPATCH = 1,
-  HSA_AGENT_FEATURE_AGENT_DISPATCH = 2,
-} hsa_agent_feature_t;
-
-typedef struct hsa_region_s {
-  uint64_t handle;
-} hsa_region_t;
-
-typedef struct hsa_isa_s {
-  uint64_t handle;
-} hsa_isa_t;
-
-hsa_status_t hsa_system_get_info(hsa_system_info_t attribute, void *value);
-
-hsa_status_t hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute,
-                                void *value);
-
-hsa_status_t hsa_isa_get_info_alt(hsa_isa_t isa, hsa_isa_info_t attribute,
-                                  void *value);
-
-hsa_status_t hsa_iterate_agents(hsa_status_t (*callback)(hsa_agent_t agent,
-                                                         void *data),
-                                void *data);
-
-hsa_status_t hsa_agent_iterate_isas(hsa_agent_t agent,
-                                    hsa_status_t (*callback)(hsa_isa_t isa,
-                                                             void *data),
-                                    void *data);
-
-typedef struct hsa_signal_s {
-  uint64_t handle;
-} hsa_signal_t;
-
-#ifdef HSA_LARGE_MODEL
-typedef int64_t hsa_signal_value_t;
-#else
-typedef int32_t hsa_signal_value_t;
-#endif
-
-hsa_status_t hsa_signal_create(hsa_signal_value_t initial_value,
-                               uint32_t num_consumers,
-                               const hsa_agent_t *consumers,
-                               hsa_signal_t *signal);
-
-hsa_status_t hsa_amd_signal_create(hsa_signal_value_t initial_value,
-                                   uint32_t num_consumers,
-                                   const hsa_agent_t *consumers,
-                                   uint64_t attributes, hsa_signal_t *signal);
-
-hsa_status_t hsa_signal_destroy(hsa_signal_t signal);
-
-void hsa_signal_store_relaxed(hsa_signal_t signal, hsa_signal_value_t value);
-
-void hsa_signal_store_screlease(hsa_signal_t signal, hsa_signal_value_t value);
-
-hsa_signal_value_t hsa_signal_load_scacquire(hsa_signal_t signal);
-
-void hsa_signal_subtract_screlease(hsa_signal_t signal,
-                                   hsa_signal_value_t value);
-
-typedef enum {
-  HSA_SIGNAL_CONDITION_EQ = 0,
-  HSA_SIGNAL_CONDITION_NE = 1,
-} hsa_signal_condition_t;
-
-typedef enum {
-  HSA_WAIT_STATE_BLOCKED = 0,
-  HSA_WAIT_STATE_ACTIVE = 1
-} hsa_wait_state_t;
-
-hsa_signal_value_t hsa_signal_wait_scacquire(hsa_signal_t signal,
-                                             hsa_signal_condition_t condition,
-                                             hsa_signal_value_t compare_value,
-                                             uint64_t timeout_hint,
-                                             hsa_wait_state_t wait_state_hint);
-
-typedef enum {
-  HSA_QUEUE_TYPE_MULTI = 0,
-  HSA_QUEUE_TYPE_SINGLE = 1,
-} hsa_queue_type_t;
-
-typedef enum {
-  HSA_QUEUE_FEATURE_KERNEL_DISPATCH = 1,
-  HSA_QUEUE_FEATURE_AGENT_DISPATCH = 2
-} hsa_queue_feature_t;
-
-typedef uint32_t hsa_queue_type32_t;
-
-typedef struct hsa_queue_s {
-  hsa_queue_type32_t type;
-  uint32_t features;
-
-#ifdef HSA_LARGE_MODEL
-  void *base_address;
-#elif defined HSA_LITTLE_ENDIAN
-  void *base_address;
-  uint32_t reserved0;
-#else
-  uint32_t reserved0;
-  void *base_address;
-#endif
-  hsa_signal_t doorbell_signal;
-  uint32_t size;
-  uint32_t reserved1;
-  uint64_t id;
-} hsa_queue_t;
-
-hsa_status_t hsa_queue_create(hsa_agent_t agent, uint32_t size,
-                              hsa_queue_type32_t type,
-                              void (*callback)(hsa_status_t status,
-                                               hsa_queue_t *source, void *data),
-                              void *data, uint32_t private_segment_size,
-                              uint32_t group_segment_size, hsa_queue_t **queue);
-
-hsa_status_t hsa_queue_destroy(hsa_queue_t *queue);
-
-uint64_t hsa_queue_load_read_index_scacquire(const hsa_queue_t *queue);
-
-uint64_t hsa_queue_add_write_index_relaxed(const hsa_queue_t *queue,
-                                           uint64_t value);
-
-typedef enum {
-  HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
-  HSA_PACKET_TYPE_BARRIER_AND = 3,
-} hsa_packet_type_t;
-
-typedef enum { HSA_FENCE_SCOPE_SYSTEM = 2 } hsa_fence_scope_t;
-
-typedef enum {
-  HSA_PACKET_HEADER_TYPE = 0,
-  HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE = 9,
-  HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE = 11
-} hsa_packet_header_t;
-
-typedef enum {
-  HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
-} hsa_kernel_dispatch_packet_setup_t;
-
-typedef enum {
-  HSA_KERNEL_DISPATCH_PACKET_SETUP_WIDTH_DIMENSIONS = 2
-} hsa_kernel_dispatch_packet_setup_width_t;
-
-typedef struct hsa_kernel_dispatch_packet_s {
-  uint16_t header;
-  uint16_t setup;
-  uint16_t workgroup_size_x;
-  uint16_t workgroup_size_y;
-  uint16_t workgroup_size_z;
-  uint16_t reserved0;
-  uint32_t grid_size_x;
-  uint32_t grid_size_y;
-  uint32_t grid_size_z;
-  uint32_t private_segment_size;
-  uint32_t group_segment_size;
-  uint64_t kernel_object;
-#ifdef HSA_LARGE_MODEL
-  void *kernarg_address;
-#elif defined HSA_LITTLE_ENDIAN
-  void *kernarg_address;
-  uint32_t reserved1;
-#else
-  uint32_t reserved1;
-  void *kernarg_address;
-#endif
-  uint64_t reserved2;
-  hsa_signal_t completion_signal;
-} hsa_kernel_dispatch_packet_t;
-
-typedef struct hsa_barrier_and_packet_s {
-  uint16_t header;
-  uint16_t reserved0;
-  uint32_t reserved1;
-  hsa_signal_t dep_signal[5];
-  uint64_t reserved2;
-  hsa_signal_t completion_signal;
-} hsa_barrier_and_packet_t;
-
-typedef enum { HSA_PROFILE_BASE = 0, HSA_PROFILE_FULL = 1 } hsa_profile_t;
-
-typedef enum {
-  HSA_EXECUTABLE_STATE_UNFROZEN = 0,
-  HSA_EXECUTABLE_STATE_FROZEN = 1
-} hsa_executable_state_t;
-
-typedef struct hsa_executable_s {
-  uint64_t handle;
-} hsa_executable_t;
-
-typedef struct hsa_executable_symbol_s {
-  uint64_t handle;
-} hsa_executable_symbol_t;
-
-typedef enum {
-  HSA_EXECUTABLE_SYMBOL_INFO_TYPE = 0,
-  HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH = 1,
-  HSA_EXECUTABLE_SYMBOL_INFO_NAME = 2,
-  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS = 21,
-  HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE = 9,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT = 22,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE = 11,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE = 13,
-  HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE = 14,
-} hsa_executable_symbol_info_t;
-
-typedef struct hsa_code_object_s {
-  uint64_t handle;
-} hsa_code_object_t;
-
-typedef enum {
-  HSA_SYMBOL_KIND_VARIABLE = 0,
-  HSA_SYMBOL_KIND_KERNEL = 1,
-  HSA_SYMBOL_KIND_INDIRECT_FUNCTION = 2
-} hsa_symbol_kind_t;
-
-typedef enum {
-  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT = 0,
-  HSA_DEFAULT_FLOAT_ROUNDING_MODE_ZERO = 1,
-  HSA_DEFAULT_FLOAT_ROUNDING_MODE_NEAR = 2,
-} hsa_default_float_rounding_mode_t;
-
-hsa_status_t hsa_memory_copy(void *dst, const void *src, size_t size);
-
-hsa_status_t hsa_executable_create(hsa_profile_t profile,
-                                   hsa_executable_state_t executable_state,
-                                   const char *options,
-                                   hsa_executable_t *executable);
-
-hsa_status_t hsa_executable_create_alt(
-    hsa_profile_t profile,
-    hsa_default_float_rounding_mode_t default_float_rounding_mode,
-    const char *options, hsa_executable_t *executable);
-
-hsa_status_t hsa_executable_destroy(hsa_executable_t executable);
-
-hsa_status_t hsa_executable_freeze(hsa_executable_t executable,
-                                   const char *options);
-
-hsa_status_t hsa_executable_validate(hsa_executable_t executable,
-                                     uint32_t *result);
-
-hsa_status_t
-hsa_executable_symbol_get_info(hsa_executable_symbol_t executable_symbol,
-                               hsa_executable_symbol_info_t attribute,
-                               void *value);
-
-hsa_status_t hsa_executable_iterate_symbols(
-    hsa_executable_t executable,
-    hsa_status_t (*callback)(hsa_executable_t exec,
-                             hsa_executable_symbol_t symbol, void *data),
-    void *data);
-
-hsa_status_t hsa_executable_get_symbol_by_name(hsa_executable_t executable,
-                                               const char *symbol_name,
-                                               const hsa_agent_t *agent,
-                                               hsa_executable_symbol_t *symbol);
-
-hsa_status_t hsa_code_object_deserialize(void *serialized_code_object,
-                                         size_t serialized_code_object_size,
-                                         const char *options,
-                                         hsa_code_object_t *code_object);
-
-hsa_status_t hsa_executable_load_code_object(hsa_executable_t executable,
-                                             hsa_agent_t agent,
-                                             hsa_code_object_t code_object,
-                                             const char *options);
-
-hsa_status_t hsa_code_object_destroy(hsa_code_object_t code_object);
-
-typedef bool (*hsa_amd_signal_handler)(hsa_signal_value_t value, void *arg);
-
-hsa_status_t hsa_amd_signal_async_handler(hsa_signal_t signal,
-                                          hsa_signal_condition_t cond,
-                                          hsa_signal_value_t value,
-                                          hsa_amd_signal_handler handler,
-                                          void *arg);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===--- amdgpu/dynamic_hsa/hsa.cpp ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implement subset of hsa api by calling into hsa library via dlopen
-// Does the dlopen/dlsym calls as part of the call to hsa_init
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "Debug.h"
-#include "dlwrap.h"
-#include "hsa.h"
-#include "hsa_ext_amd.h"
-#include <memory>
-
-DLWRAP_INITIALIZE()
-
-DLWRAP_INTERNAL(hsa_init, 0)
-
-DLWRAP(hsa_status_string, 2)
-DLWRAP(hsa_shut_down, 0)
-DLWRAP(hsa_system_get_info, 2)
-DLWRAP(hsa_agent_get_info, 3)
-DLWRAP(hsa_isa_get_info_alt, 3)
-DLWRAP(hsa_iterate_agents, 2)
-DLWRAP(hsa_agent_iterate_isas, 3)
-DLWRAP(hsa_signal_create, 4)
-DLWRAP(hsa_signal_destroy, 1)
-DLWRAP(hsa_signal_store_relaxed, 2)
-DLWRAP(hsa_signal_store_screlease, 2)
-DLWRAP(hsa_signal_wait_scacquire, 5)
-DLWRAP(hsa_signal_load_scacquire, 1)
-DLWRAP(hsa_signal_subtract_screlease, 2)
-DLWRAP(hsa_queue_create, 8)
-DLWRAP(hsa_queue_destroy, 1)
-DLWRAP(hsa_queue_load_read_index_scacquire, 1)
-DLWRAP(hsa_queue_add_write_index_relaxed, 2)
-DLWRAP(hsa_memory_copy, 3)
-DLWRAP(hsa_executable_create, 4)
-DLWRAP(hsa_executable_create_alt, 4)
-DLWRAP(hsa_executable_destroy, 1)
-DLWRAP(hsa_executable_freeze, 2)
-DLWRAP(hsa_executable_validate, 2)
-DLWRAP(hsa_executable_symbol_get_info, 3)
-DLWRAP(hsa_executable_get_symbol_by_name, 4)
-DLWRAP(hsa_executable_iterate_symbols, 3)
-DLWRAP(hsa_code_object_deserialize, 4)
-DLWRAP(hsa_executable_load_code_object, 4)
-DLWRAP(hsa_code_object_destroy, 1)
-DLWRAP(hsa_amd_agent_memory_pool_get_info, 4)
-DLWRAP(hsa_amd_agent_iterate_memory_pools, 3)
-DLWRAP(hsa_amd_memory_pool_allocate, 4)
-DLWRAP(hsa_amd_memory_pool_free, 1)
-DLWRAP(hsa_amd_memory_async_copy, 8)
-DLWRAP(hsa_amd_memory_pool_get_info, 3)
-DLWRAP(hsa_amd_agents_allow_access, 4)
-DLWRAP(hsa_amd_memory_lock, 5)
-DLWRAP(hsa_amd_memory_unlock, 1)
-DLWRAP(hsa_amd_memory_fill, 3)
-DLWRAP(hsa_amd_register_system_event_handler, 2)
-DLWRAP(hsa_amd_signal_create, 5)
-DLWRAP(hsa_amd_signal_async_handler, 5)
-DLWRAP(hsa_amd_pointer_info, 5)
-
-DLWRAP_FINALIZE()
-
-#ifndef DYNAMIC_HSA_PATH
-#define DYNAMIC_HSA_PATH "libhsa-runtime64.so"
-#endif
-
-#ifndef TARGET_NAME
-#error "Missing TARGET_NAME macro"
-#endif
-#ifndef DEBUG_PREFIX
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#endif
-
-static bool checkForHSA() {
-  // return true if dlopen succeeded and all functions found
-
-  const char *HsaLib = DYNAMIC_HSA_PATH;
-  std::string ErrMsg;
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(HsaLib, &ErrMsg));
-  if (!DynlibHandle->isValid()) {
-    DP("Unable to load library '%s': %s!\n", HsaLib, ErrMsg.c_str());
-    return false;
-  }
-
-  for (size_t I = 0; I < dlwrap::size(); I++) {
-    const char *Sym = dlwrap::symbol(I);
-
-    void *P = DynlibHandle->getAddressOfSymbol(Sym);
-    if (P == nullptr) {
-      DP("Unable to find '%s' in '%s'!\n", Sym, HsaLib);
-      return false;
-    }
-    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
-
-    *dlwrap::pointer(I) = P;
-  }
-
-  return true;
-}
-
-hsa_status_t hsa_init() {
-  if (!checkForHSA()) {
-    return HSA_STATUS_ERROR;
-  }
-  return dlwrap_hsa_init();
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ /dev/null
@@ -1,163 +0,0 @@
-//===--- amdgpu/dynamic_hsa/hsa_ext_amd.h ------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The parts of the hsa api that are presently in use by the amdgpu plugin
-//
-//===----------------------------------------------------------------------===//
-#ifndef HSA_RUNTIME_EXT_AMD_H_
-#define HSA_RUNTIME_EXT_AMD_H_
-
-#include "hsa.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct hsa_amd_memory_pool_s {
-  uint64_t handle;
-} hsa_amd_memory_pool_t;
-
-typedef enum hsa_amd_memory_pool_global_flag_s {
-  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT = 1,
-  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED = 2,
-  HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED = 4
-} hsa_amd_memory_pool_global_flag_t;
-
-typedef enum {
-  HSA_AMD_SEGMENT_GLOBAL = 0,
-  HSA_AMD_SEGMENT_READONLY = 1,
-  HSA_AMD_SEGMENT_PRIVATE = 2,
-  HSA_AMD_SEGMENT_GROUP = 3,
-} hsa_amd_segment_t;
-
-typedef enum {
-  HSA_AMD_MEMORY_POOL_INFO_SEGMENT = 0,
-  HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS = 1,
-  HSA_AMD_MEMORY_POOL_INFO_SIZE = 2,
-  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED = 5,
-  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
-  HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
-  HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
-} hsa_amd_memory_pool_info_t;
-
-typedef enum {
-  HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS = 0,
-} hsa_amd_agent_memory_pool_info_t;
-
-typedef enum {
-  HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED = 0,
-} hsa_amd_memory_pool_access_t;
-
-typedef enum hsa_amd_agent_info_s {
-  HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
-  HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
-  HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
-  HSA_AMD_AGENT_INFO_PRODUCT_NAME = 0xA009,
-  HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A,
-  HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B,
-  HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010
-} hsa_amd_agent_info_t;
-
-hsa_status_t hsa_amd_memory_pool_get_info(hsa_amd_memory_pool_t memory_pool,
-                                          hsa_amd_memory_pool_info_t attribute,
-                                          void *value);
-
-hsa_status_t hsa_amd_agent_iterate_memory_pools(
-    hsa_agent_t agent,
-    hsa_status_t (*callback)(hsa_amd_memory_pool_t memory_pool, void *data),
-    void *data);
-
-hsa_status_t hsa_amd_memory_pool_allocate(hsa_amd_memory_pool_t memory_pool,
-                                          size_t size, uint32_t flags,
-                                          void **ptr);
-
-hsa_status_t hsa_amd_memory_pool_free(void *ptr);
-
-hsa_status_t hsa_amd_memory_async_copy(void *dst, hsa_agent_t dst_agent,
-                                       const void *src, hsa_agent_t src_agent,
-                                       size_t size, uint32_t num_dep_signals,
-                                       const hsa_signal_t *dep_signals,
-                                       hsa_signal_t completion_signal);
-
-hsa_status_t hsa_amd_agent_memory_pool_get_info(
-    hsa_agent_t agent, hsa_amd_memory_pool_t memory_pool,
-    hsa_amd_agent_memory_pool_info_t attribute, void *value);
-
-hsa_status_t hsa_amd_agents_allow_access(uint32_t num_agents,
-                                         const hsa_agent_t *agents,
-                                         const uint32_t *flags,
-                                         const void *ptr);
-
-hsa_status_t hsa_amd_memory_lock(void* host_ptr, size_t size,
-                                hsa_agent_t* agents, int num_agent,
-                                void** agent_ptr);
-
-hsa_status_t hsa_amd_memory_unlock(void* host_ptr);
-
-hsa_status_t hsa_amd_memory_fill(void *ptr, uint32_t value, size_t count);
-
-typedef enum hsa_amd_event_type_s {
-  HSA_AMD_GPU_MEMORY_FAULT_EVENT = 0,
-} hsa_amd_event_type_t;
-
-typedef struct hsa_amd_gpu_memory_fault_info_s {
-  hsa_agent_t agent;
-  uint64_t virtual_address;
-  uint32_t fault_reason_mask;
-} hsa_amd_gpu_memory_fault_info_t;
-
-typedef struct hsa_amd_event_s {
-  hsa_amd_event_type_t event_type;
-  union {
-    hsa_amd_gpu_memory_fault_info_t memory_fault;
-  };
-} hsa_amd_event_t;
-
-typedef hsa_status_t (*hsa_amd_system_event_callback_t)(
-    const hsa_amd_event_t *event, void *data);
-
-hsa_status_t
-hsa_amd_register_system_event_handler(hsa_amd_system_event_callback_t callback,
-                                      void *data);
-
-typedef enum {
-  HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT = 1 << 0,
-  HSA_AMD_MEMORY_FAULT_READ_ONLY = 1 << 1,
-  HSA_AMD_MEMORY_FAULT_NX = 1 << 2,
-  HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3,
-  HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4,
-  HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5,
-  HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6,
-  HSA_AMD_MEMORY_FAULT_HANG = 1 << 31
-} hsa_amd_memory_fault_reason_t;
-
-typedef enum {
-  HSA_EXT_POINTER_TYPE_UNKNOWN = 0,
-  HSA_EXT_POINTER_TYPE_HSA = 1,
-  HSA_EXT_POINTER_TYPE_LOCKED = 2
-} hsa_amd_pointer_type_t;
-
-typedef struct hsa_amd_pointer_info_s {
-  uint32_t size;
-  hsa_amd_pointer_type_t type;
-  void* agentBaseAddress;
-  void* hostBaseAddress;
-  size_t sizeInBytes;
-} hsa_amd_pointer_info_t;
-
-hsa_status_t hsa_amd_pointer_info(const void* ptr,
-                                          hsa_amd_pointer_info_t* info,
-                                          void* (*alloc)(size_t),
-                                          uint32_t* num_agents_accessible,
-                                          hsa_agent_t** accessible);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===--- amdgpu/impl/data.cpp ------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "impl_runtime.h"
-#include "hsa_api.h"
-#include "internal.h"
-#include "rt.h"
-#include <cassert>
-#include <stdio.h>
-#include <string.h>
-#include <vector>
-
-using core::TaskImpl;
-
-namespace core {
-namespace Runtime {
-hsa_status_t HostMalloc(void **ptr, size_t size,
-                        hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr);
-  DP("Malloced %p\n", *ptr);
-  if (err == HSA_STATUS_SUCCESS) {
-    err = core::allow_access_to_all_gpu_agents(*ptr);
-  }
-  return err;
-}
-
-hsa_status_t Memfree(void *ptr) {
-  hsa_status_t err = hsa_amd_memory_pool_free(ptr);
-  DP("Freed %p\n", ptr);
-  return err;
-}
-} // namespace Runtime
-} // namespace core
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ /dev/null
@@ -1,15 +0,0 @@
-//===--- amdgpu/impl/get_elf_mach_gfx_name.h ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED
-#define GET_ELF_MACH_GFX_NAME_H_INCLUDED
-
-#include <stdint.h>
-
-const char *get_elf_mach_gfx_name(uint32_t EFlags);
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//===--- amdgpu/impl/get_elf_mach_gfx_name.cpp -------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "get_elf_mach_gfx_name.h"
-
-// This header conflicts with the system elf.h (macros vs enums of the same
-// identifier) and contains more up to date values for the enum checked here.
-// rtl.cpp uses the system elf.h.
-#include "llvm/BinaryFormat/ELF.h"
-
-const char *get_elf_mach_gfx_name(uint32_t EFlags) {
-  using namespace llvm::ELF;
-  uint32_t Gfx = (EFlags & EF_AMDGPU_MACH);
-  switch (Gfx) {
-  case EF_AMDGPU_MACH_AMDGCN_GFX801:
-    return "gfx801";
-  case EF_AMDGPU_MACH_AMDGCN_GFX802:
-    return "gfx802";
-  case EF_AMDGPU_MACH_AMDGCN_GFX803:
-    return "gfx803";
-  case EF_AMDGPU_MACH_AMDGCN_GFX805:
-    return "gfx805";
-  case EF_AMDGPU_MACH_AMDGCN_GFX810:
-    return "gfx810";
-  case EF_AMDGPU_MACH_AMDGCN_GFX900:
-    return "gfx900";
-  case EF_AMDGPU_MACH_AMDGCN_GFX902:
-    return "gfx902";
-  case EF_AMDGPU_MACH_AMDGCN_GFX904:
-    return "gfx904";
-  case EF_AMDGPU_MACH_AMDGCN_GFX906:
-    return "gfx906";
-  case EF_AMDGPU_MACH_AMDGCN_GFX908:
-    return "gfx908";
-  case EF_AMDGPU_MACH_AMDGCN_GFX909:
-    return "gfx909";
-  case EF_AMDGPU_MACH_AMDGCN_GFX90A:
-    return "gfx90a";
-  case EF_AMDGPU_MACH_AMDGCN_GFX90C:
-    return "gfx90c";
-  case EF_AMDGPU_MACH_AMDGCN_GFX940:
-    return "gfx940";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1010:
-    return "gfx1010";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1011:
-    return "gfx1011";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1012:
-    return "gfx1012";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1013:
-    return "gfx1013";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1030:
-    return "gfx1030";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1031:
-    return "gfx1031";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1032:
-    return "gfx1032";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1033:
-    return "gfx1033";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1034:
-    return "gfx1034";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1035:
-    return "gfx1035";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1036:
-    return "gfx1036";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1100:
-    return "gfx1100";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1101:
-    return "gfx1101";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1102:
-    return "gfx1102";
-  case EF_AMDGPU_MACH_AMDGCN_GFX1103:
-    return "gfx1103";
-  default:
-    return "--unknown gfx";
-  }
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h b/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===--- amdgpu/impl/hsa_api.h ------------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef AMDGPU_HSA_API_H_INCLUDED
-#define AMDGPU_HSA_API_H_INCLUDED
-
-#if defined(__has_include)
-#if __has_include("hsa/hsa.h")
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#elif __has_include("hsa.h")
-#include "hsa.h"
-#include "hsa_ext_amd.h"
-#endif
-#else
-#include "hsa/hsa.h"
-#include "hsa_ext_amd.h"
-#endif
-
-
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "rt.h"
-#include <memory>
-
-/*
- * Data
- */
-
-hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
-  hsa_status_t err = HSA_STATUS_SUCCESS;
-  hsa_amd_pointer_info_t info;
-  info.size = sizeof(hsa_amd_pointer_info_t);
-  err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
-                             /*num_agents_accessible=*/nullptr,
-                             /*accessible=*/nullptr);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error when getting pointer info\n");
-    return err;
-  }
-
-  if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
-    // When user passes in a basePtr+offset we need to fix the
-    // locked pointer to include the offset: ROCr always returns
-    // the base locked address, not the shifted one.
-    if ((char *)info.hostBaseAddress <= (char *)ptr &&
-        (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
-      *agentBaseAddress =
-          (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
-                   (uint64_t)info.hostBaseAddress);
-    else // address is already device-agent accessible, no need to compute
-         // offset
-      *agentBaseAddress = ptr;
-  } else
-    *agentBaseAddress = nullptr;
-
-  return HSA_STATUS_SUCCESS;
-}
-
-// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
-static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
-                                    hsa_agent_t agent, const void *src,
-                                    size_t size) {
-  const hsa_signal_value_t init = 1;
-  const hsa_signal_value_t success = 0;
-  hsa_signal_store_screlease(signal, init);
-
-  hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
-                                               nullptr, signal);
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-
-  // async_copy reports success by decrementing and failure by setting to < 0
-  hsa_signal_value_t got = init;
-  while (got == init)
-    got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
-                                    UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-
-  if (got != success)
-    return HSA_STATUS_ERROR;
-
-  return err;
-}
-
-struct implFreePtrDeletor {
-  void operator()(void *p) {
-    core::Runtime::Memfree(p); // ignore failure to free
-  }
-};
-
-enum CopyDirection { H2D, D2H };
-
-static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
-                                         hsa_signal_t signal, void *dest,
-                                         hsa_agent_t agent, void *src,
-                                         void *lockingPtr, size_t size) {
-  void *lockedPtr = nullptr;
-  hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
-  bool HostPtrIsLocked = true;
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-  if (!lockedPtr) { // not locked
-    HostPtrIsLocked = false;
-    hsa_agent_t agents[1] = {agent};
-    err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
-                              (void **)&lockedPtr);
-    if (err != HSA_STATUS_SUCCESS)
-      return err;
-    DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
-       lockingPtr, lockedPtr, size);
-  }
-
-  switch (direction) {
-  case H2D:
-    err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
-    break;
-  case D2H:
-    err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
-    break;
-  }
-
-  if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
-    // do not leak locked host pointers, but discard potential error message
-    // because the initial error was in the copy function
-    hsa_amd_memory_unlock(lockingPtr);
-    return err;
-  }
-
-  // unlock only if not user locked
-  if (!HostPtrIsLocked)
-    err = hsa_amd_memory_unlock(lockingPtr);
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
-                             void *hostSrc, size_t size,
-                             hsa_agent_t device_agent,
-                             hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t err;
-
-  err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
-                             device_agent, hostSrc, hostSrc, size);
-
-  if (err == HSA_STATUS_SUCCESS)
-    return err;
-
-  // async memcpy sometimes fails in situations where
-  // allocate + copy succeeds. Looks like it might be related to
-  // locking part of a read only segment. Fall back for now.
-  void *tempHostPtr;
-  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
-  if (ret != HSA_STATUS_SUCCESS) {
-    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
-    return ret;
-  }
-  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
-  memcpy(tempHostPtr, hostSrc, size);
-
-  return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
-                              device_agent, tempHostPtr, tempHostPtr, size);
-}
-
-hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
-                             void *deviceSrc, size_t size,
-                             hsa_agent_t deviceAgent,
-                             hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t err;
-
-  // device has always visibility over both pointers, so use that
-  err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
-                             deviceSrc, hostDest, size);
-
-  if (err == HSA_STATUS_SUCCESS)
-    return err;
-
-  // hsa_memory_copy sometimes fails in situations where
-  // allocate + copy succeeds. Looks like it might be related to
-  // locking part of a read only segment. Fall back for now.
-  void *tempHostPtr;
-  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
-  if (ret != HSA_STATUS_SUCCESS) {
-    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
-    return ret;
-  }
-  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
-
-  err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
-                             deviceAgent, deviceSrc, tempHostPtr, size);
-  if (err != HSA_STATUS_SUCCESS)
-    return HSA_STATUS_ERROR;
-
-  memcpy(hostDest, tempHostPtr, size);
-  return HSA_STATUS_SUCCESS;
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- amdgpu/impl/impl_runtime.h ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef INCLUDE_IMPL_RUNTIME_H_
-#define INCLUDE_IMPL_RUNTIME_H_
-
-#include "hsa_api.h"
-
-extern "C" {
-
-// Check if pointer ptr is already locked
-hsa_status_t is_locked(void *ptr, void **agentBaseAddress);
-
-hsa_status_t impl_module_register_from_memory_to_place(
-    void *module_bytes, size_t module_size, int DeviceId,
-    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
-                                         void *cb_state),
-    void *cb_state);
-
-hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
-                             void *hostSrc, size_t size,
-                             hsa_agent_t device_agent,
-                             hsa_amd_memory_pool_t MemoryPool);
-
-hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc,
-                             size_t size, hsa_agent_t device_agent,
-                             hsa_amd_memory_pool_t MemoryPool);
-}
-
-#endif // INCLUDE_IMPL_RUNTIME_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ /dev/null
@@ -1,154 +0,0 @@
-//===--- amdgpu/impl/internal.h ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
-#define SRC_RUNTIME_INCLUDE_INTERNAL_H_
-#include <inttypes.h>
-#include <pthread.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <cstring>
-#include <map>
-#include <queue>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "hsa_api.h"
-
-#include "impl_runtime.h"
-
-#ifndef TARGET_NAME
-#error "Missing TARGET_NAME macro"
-#endif
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#include "Debug.h"
-
-#define MAX_NUM_KERNELS (1024 * 16)
-
-// ---------------------- Kernel Start -------------
-typedef struct atl_kernel_info_s {
-  uint64_t kernel_object;
-  uint32_t group_segment_size;
-  uint32_t private_segment_size;
-  uint32_t sgpr_count;
-  uint32_t vgpr_count;
-  uint32_t sgpr_spill_count;
-  uint32_t vgpr_spill_count;
-  uint32_t kernel_segment_size;
-  uint32_t explicit_argument_count;
-  uint32_t implicit_argument_count;
-} atl_kernel_info_t;
-
-typedef struct atl_symbol_info_s {
-  uint64_t addr;
-  uint32_t size;
-} atl_symbol_info_t;
-
-// ---------------------- Kernel End -------------
-
-namespace core {
-class TaskgroupImpl;
-class TaskImpl;
-class Kernel;
-class KernelImpl;
-} // namespace core
-
-struct SignalPoolT {
-  SignalPoolT() {}
-  SignalPoolT(const SignalPoolT &) = delete;
-  SignalPoolT(SignalPoolT &&) = delete;
-  ~SignalPoolT() {
-    size_t N = state.size();
-    for (size_t i = 0; i < N; i++) {
-      hsa_signal_t signal = state.front();
-      state.pop();
-      hsa_status_t rc = hsa_signal_destroy(signal);
-      if (rc != HSA_STATUS_SUCCESS) {
-        DP("Signal pool destruction failed\n");
-      }
-    }
-  }
-  size_t size() {
-    lock l(&mutex);
-    return state.size();
-  }
-  void push(hsa_signal_t s) {
-    lock l(&mutex);
-    state.push(s);
-  }
-  hsa_signal_t pop(void) {
-    lock l(&mutex);
-    if (!state.empty()) {
-      hsa_signal_t res = state.front();
-      state.pop();
-      return res;
-    }
-
-    // Pool empty, attempt to create another signal
-    hsa_signal_t new_signal;
-    hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
-    if (err == HSA_STATUS_SUCCESS) {
-      return new_signal;
-    }
-
-    // Fail
-    return {0};
-  }
-
-private:
-  static pthread_mutex_t mutex;
-  std::queue<hsa_signal_t> state;
-  struct lock {
-    lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
-    ~lock() { pthread_mutex_unlock(m); }
-    pthread_mutex_t *m;
-  };
-};
-
-namespace core {
-hsa_status_t atl_init_gpu_context();
-
-hsa_status_t init_hsa();
-hsa_status_t finalize_hsa();
-/*
- * Generic utils
- */
-template <typename T> inline T alignDown(T value, size_t alignment) {
-  return (T)(value & ~(alignment - 1));
-}
-
-template <typename T> inline T *alignDown(T *value, size_t alignment) {
-  return reinterpret_cast<T *>(alignDown((intptr_t)value, alignment));
-}
-
-template <typename T> inline T alignUp(T value, size_t alignment) {
-  return alignDown((T)(value + alignment - 1), alignment);
-}
-
-template <typename T> inline T *alignUp(T *value, size_t alignment) {
-  return reinterpret_cast<T *>(
-      alignDown((intptr_t)(value + alignment - 1), alignment));
-}
-
-extern bool atl_is_impl_initialized();
-
-bool handle_group_signal(hsa_signal_value_t value, void *arg);
-
-hsa_status_t allow_access_to_all_gpu_agents(void *ptr);
-} // namespace core
-
-inline const char *get_error_string(hsa_status_t err) {
-  const char *res;
-  hsa_status_t rc = hsa_status_string(err, &res);
-  return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN.";
-}
-
-#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===--- amdgpu/impl/interop_hsa.h -------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef INCLUDE_INTEROP_HSA_H_
-#define INCLUDE_INTEROP_HSA_H_
-
-#include "impl_runtime.h"
-#include "hsa_api.h"
-#include "internal.h"
-
-#include <map>
-#include <string>
-
-extern "C" {
-
-hsa_status_t interop_hsa_get_symbol_info(
-    const std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size);
-
-}
-
-#endif // INCLUDE_INTEROP_HSA_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===--- amdgpu/impl/interop_hsa.cpp ------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "interop_hsa.h"
-#include "internal.h"
-
-hsa_status_t interop_hsa_get_symbol_info(
-    const std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size) {
-  /*
-     // Typical usage:
-     void *var_addr;
-     size_t var_size;
-     interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr,
-     &var_size);
-     impl_memcpy(signal, host_add, var_addr, var_size);
-  */
-
-  if (!symbol || !var_addr || !var_size)
-    return HSA_STATUS_ERROR;
-
-  // get the symbol info
-  std::string symbolStr = std::string(symbol);
-  auto It = SymbolInfoTable.find(symbolStr);
-  if (It != SymbolInfoTable.end()) {
-    atl_symbol_info_t info = It->second;
-    *var_addr = reinterpret_cast<void *>(info.addr);
-    *var_size = info.size;
-    return HSA_STATUS_SUCCESS;
-  } else {
-    *var_addr = NULL;
-    *var_size = 0;
-    return HSA_STATUS_ERROR;
-  }
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
+++ /dev/null
@@ -1,282 +0,0 @@
-//===--- amdgpu/impl/msgpack.h ------------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MSGPACK_H
-#define MSGPACK_H
-
-#include <functional>
-
-namespace msgpack {
-
-// The message pack format is dynamically typed, schema-less. Format is:
-// message: [type][header][payload]
-// where type is one byte, header length is a fixed length function of type
-// payload is zero to N bytes, with the length encoded in [type][header]
-
-// Scalar fields include boolean, signed integer, float, string etc
-// Composite types are sequences of messages
-// Array field is [header][element][element]...
-// Map field is [header][key][value][key][value]...
-
-// Multibyte integer fields are big endian encoded
-// The map key can be any message type
-// Maps may contain duplicate keys
-// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or
-// in as many as nine, as signed or unsigned. Implementation defined.
-// Similarly "foo" may embed the length in the type field or in multiple bytes
-
-// This parser is structured as an iterator over a sequence of bytes.
-// It calls a user provided function on each message in order to extract fields
-// The default implementation for each scalar type is to do nothing. For map or
-// arrays, the default implementation returns just after that message to support
-// iterating to the next message, but otherwise has no effect.
-
-struct byte_range {
-  const unsigned char *start;
-  const unsigned char *end;
-};
-
-const unsigned char *skip_next_message(const unsigned char *start,
-                                       const unsigned char *end);
-
-template <typename Derived> class functors_defaults {
-public:
-  void cb_string(size_t N, const unsigned char *str) {
-    derived().handle_string(N, str);
-  }
-  void cb_boolean(bool x) { derived().handle_boolean(x); }
-  void cb_signed(int64_t x) { derived().handle_signed(x); }
-  void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); }
-  void cb_array_elements(byte_range bytes) {
-    derived().handle_array_elements(bytes);
-  }
-  void cb_map_elements(byte_range key, byte_range value) {
-    derived().handle_map_elements(key, value);
-  }
-  const unsigned char *cb_array(uint64_t N, byte_range bytes) {
-    return derived().handle_array(N, bytes);
-  }
-  const unsigned char *cb_map(uint64_t N, byte_range bytes) {
-    return derived().handle_map(N, bytes);
-  }
-
-private:
-  Derived &derived() { return *static_cast<Derived *>(this); }
-
-  // Default implementations for scalar ops are no-ops
-  void handle_string(size_t, const unsigned char *) {}
-  void handle_boolean(bool) {}
-  void handle_signed(int64_t) {}
-  void handle_unsigned(uint64_t) {}
-  void handle_array_elements(byte_range) {}
-  void handle_map_elements(byte_range, byte_range) {}
-
-  // Default implementation for sequences is to skip over the messages
-  const unsigned char *handle_array(uint64_t N, byte_range bytes) {
-    for (uint64_t i = 0; i < N; i++) {
-      const unsigned char *next = skip_next_message(bytes.start, bytes.end);
-      if (!next) {
-        return nullptr;
-      }
-      cb_array_elements(bytes);
-      bytes.start = next;
-    }
-    return bytes.start;
-  }
-  const unsigned char *handle_map(uint64_t N, byte_range bytes) {
-    for (uint64_t i = 0; i < N; i++) {
-      const unsigned char *start_key = bytes.start;
-      const unsigned char *end_key = skip_next_message(start_key, bytes.end);
-      if (!end_key) {
-        return nullptr;
-      }
-      const unsigned char *start_value = end_key;
-      const unsigned char *end_value =
-          skip_next_message(start_value, bytes.end);
-      if (!end_value) {
-        return nullptr;
-      }
-      cb_map_elements({start_key, end_key}, {start_value, end_value});
-      bytes.start = end_value;
-    }
-    return bytes.start;
-  }
-};
-
-typedef enum : uint8_t {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
-#include "msgpack.def"
-#undef X
-} type;
-
-[[noreturn]] void internal_error();
-type parse_type(unsigned char x);
-unsigned bytes_used_fixed(type ty);
-
-typedef uint64_t (*payload_info_t)(const unsigned char *);
-payload_info_t payload_info(msgpack::type ty);
-
-template <typename T, typename R> R bitcast(T x);
-
-template <typename F, msgpack::type ty>
-const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) {
-  const unsigned char *start = bytes.start;
-  const unsigned char *end = bytes.end;
-  const uint64_t available = end - start;
-  assert(available != 0);
-  assert(ty == parse_type(*start));
-
-  const uint64_t bytes_used = bytes_used_fixed(ty);
-  if (available < bytes_used) {
-    return 0;
-  }
-  const uint64_t available_post_header = available - bytes_used;
-
-  const payload_info_t info = payload_info(ty);
-  const uint64_t N = info(start);
-
-  switch (ty) {
-  case msgpack::t:
-  case msgpack::f: {
-    // t is 0b11000010, f is 0b11000011, masked with 0x1
-    f.cb_boolean(N);
-    return start + bytes_used;
-  }
-
-  case msgpack::posfixint:
-  case msgpack::uint8:
-  case msgpack::uint16:
-  case msgpack::uint32:
-  case msgpack::uint64: {
-    f.cb_unsigned(N);
-    return start + bytes_used;
-  }
-
-  case msgpack::negfixint:
-  case msgpack::int8:
-  case msgpack::int16:
-  case msgpack::int32:
-  case msgpack::int64: {
-    f.cb_signed(bitcast<uint64_t, int64_t>(N));
-    return start + bytes_used;
-  }
-
-  case msgpack::fixstr:
-  case msgpack::str8:
-  case msgpack::str16:
-  case msgpack::str32: {
-    if (available_post_header < N) {
-      return 0;
-    } else {
-      f.cb_string(N, start + bytes_used);
-      return start + bytes_used + N;
-    }
-  }
-
-  case msgpack::fixarray:
-  case msgpack::array16:
-  case msgpack::array32: {
-    return f.cb_array(N, {start + bytes_used, end});
-  }
-
-  case msgpack::fixmap:
-  case msgpack::map16:
-  case msgpack::map32: {
-    return f.cb_map(N, {start + bytes_used, end});
-  }
-
-  case msgpack::nil:
-  case msgpack::bin8:
-  case msgpack::bin16:
-  case msgpack::bin32:
-  case msgpack::float32:
-  case msgpack::float64:
-  case msgpack::ext8:
-  case msgpack::ext16:
-  case msgpack::ext32:
-  case msgpack::fixext1:
-  case msgpack::fixext2:
-  case msgpack::fixext4:
-  case msgpack::fixext8:
-  case msgpack::fixext16:
-  case msgpack::never_used: {
-    if (available_post_header < N) {
-      return 0;
-    }
-    return start + bytes_used + N;
-  }
-  }
-  internal_error();
-}
-
-template <typename F>
-const unsigned char *handle_msgpack(byte_range bytes, F f) {
-  const unsigned char *start = bytes.start;
-  const unsigned char *end = bytes.end;
-  const uint64_t available = end - start;
-  if (available == 0) {
-    return 0;
-  }
-  const type ty = parse_type(*start);
-
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case msgpack::NAME:                                                          \
-    return handle_msgpack_given_type<F, msgpack::NAME>(bytes, f);
-#include "msgpack.def"
-#undef X
-  }
-
-  internal_error();
-}
-
-bool message_is_string(byte_range bytes, const char *str);
-
-template <typename C> void foronly_string(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_string(size_t N, const unsigned char *str) { cb(N, str); }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foronly_unsigned(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_unsigned(uint64_t x) { cb(x); }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foreach_array(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_array_elements(byte_range element) { cb(element); }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foreach_map(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_map_elements(byte_range key, byte_range value) {
-      cb(key, value);
-    }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-// Crude approximation to json
-void dump(byte_range);
-
-} // namespace msgpack
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//===--- amdgpu/impl/msgpack.cpp ---------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <functional>
-#include <string>
-
-#include "msgpack.h"
-
-namespace msgpack {
-
-[[noreturn]] void internal_error() {
-  printf("internal error\n");
-  exit(1);
-}
-
-const char *type_name(type ty) {
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case NAME:                                                                   \
-    return #NAME;
-#include "msgpack.def"
-#undef X
-  }
-  internal_error();
-}
-
-unsigned bytes_used_fixed(msgpack::type ty) {
-  using namespace msgpack;
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case NAME:                                                                   \
-    return WIDTH;
-#include "msgpack.def"
-#undef X
-  }
-  internal_error();
-}
-
-msgpack::type parse_type(unsigned char x) {
-
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  if (x >= LOWER && x <= UPPER) {                                              \
-    return NAME;                                                               \
-  } else
-#include "msgpack.def"
-#undef X
-  { internal_error(); }
-}
-
-template <typename T, typename R> R bitcast(T x) {
-  static_assert(sizeof(T) == sizeof(R), "");
-  R tmp;
-  memcpy(&tmp, &x, sizeof(T));
-  return tmp;
-}
-template int64_t bitcast<uint64_t, int64_t>(uint64_t);
-} // namespace msgpack
-
-// Helper functions for reading additional payload from the header
-// Depending on the type, this can be a number of bytes, elements,
-// key-value pairs or an embedded integer.
-// Each takes a pointer to the start of the header and returns a uint64_t
-
-namespace {
-namespace payload {
-uint64_t read_zero(const unsigned char *) { return 0; }
-
-// Read the first byte and zero/sign extend it
-uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; }
-uint64_t read_embedded_s8(const unsigned char *start) {
-  int64_t res = msgpack::bitcast<uint8_t, int8_t>(start[0]);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-
-// Read a masked part of the first byte
-uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; }
-uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; }
-uint64_t read_via_mask_0x1f(const unsigned char *start) {
-  return *start & 0x1fu;
-}
-
-// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend
-// Big endian format.
-uint64_t read_size_field_u8(const unsigned char *from) {
-  from++;
-  return from[0];
-}
-
-// TODO: detect whether host is little endian or not, and whether the intrinsic
-// is available. And probably use the builtin to test the diy
-const bool use_bswap = false;
-
-uint64_t read_size_field_u16(const unsigned char *from) {
-  from++;
-  if (use_bswap) {
-    uint16_t b;
-    memcpy(&b, from, 2);
-    return __builtin_bswap16(b);
-  } else {
-    return (from[0] << 8u) | from[1];
-  }
-}
-uint64_t read_size_field_u32(const unsigned char *from) {
-  from++;
-  if (use_bswap) {
-    uint32_t b;
-    memcpy(&b, from, 4);
-    return __builtin_bswap32(b);
-  } else {
-    return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) |
-           (from[3] << 0u);
-  }
-}
-uint64_t read_size_field_u64(const unsigned char *from) {
-  from++;
-  if (use_bswap) {
-    uint64_t b;
-    memcpy(&b, from, 8);
-    return __builtin_bswap64(b);
-  } else {
-    return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) |
-           ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) |
-           (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) |
-           (from[7] << 0u);
-  }
-}
-
-uint64_t read_size_field_s8(const unsigned char *from) {
-  uint8_t u = read_size_field_u8(from);
-  int64_t res = msgpack::bitcast<uint8_t, int8_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s16(const unsigned char *from) {
-  uint16_t u = read_size_field_u16(from);
-  int64_t res = msgpack::bitcast<uint16_t, int16_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s32(const unsigned char *from) {
-  uint32_t u = read_size_field_u32(from);
-  int64_t res = msgpack::bitcast<uint32_t, int32_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s64(const unsigned char *from) {
-  uint64_t u = read_size_field_u64(from);
-  int64_t res = msgpack::bitcast<uint64_t, int64_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-} // namespace payload
-} // namespace
-
-namespace msgpack {
-
-payload_info_t payload_info(msgpack::type ty) {
-  using namespace msgpack;
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case NAME:                                                                   \
-    return payload::PAYLOAD;
-#include "msgpack.def"
-#undef X
-  }
-  internal_error();
-}
-
-} // namespace msgpack
-
-const unsigned char *msgpack::skip_next_message(const unsigned char *start,
-                                                const unsigned char *end) {
-  class f : public functors_defaults<f> {};
-  return handle_msgpack({start, end}, f());
-}
-
-namespace msgpack {
-bool message_is_string(byte_range bytes, const char *needle) {
-  bool matched = false;
-  size_t needleN = strlen(needle);
-
-  foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) {
-    if (N == needleN) {
-      if (memcmp(needle, str, N) == 0) {
-        matched = true;
-      }
-    }
-  });
-  return matched;
-}
-
-void dump(byte_range bytes) {
-  struct inner : functors_defaults<inner> {
-    inner(unsigned indent) : indent(indent) {}
-    const unsigned by = 2;
-    unsigned indent = 0;
-
-    void handle_string(size_t N, const unsigned char *bytes) {
-      char *tmp = (char *)malloc(N + 1);
-      memcpy(tmp, bytes, N);
-      tmp[N] = '\0';
-      printf("\"%s\"", tmp);
-      free(tmp);
-    }
-
-    void handle_signed(int64_t x) { printf("%ld", x); }
-    void handle_unsigned(uint64_t x) { printf("%lu", x); }
-
-    const unsigned char *handle_array(uint64_t N, byte_range bytes) {
-      printf("\n%*s[\n", indent, "");
-      indent += by;
-
-      for (uint64_t i = 0; i < N; i++) {
-        indent += by;
-        printf("%*s", indent, "");
-        const unsigned char *next = handle_msgpack<inner>(bytes, {indent});
-        printf(",\n");
-        indent -= by;
-        bytes.start = next;
-        if (!next) {
-          break;
-        }
-      }
-      indent -= by;
-      printf("%*s]", indent, "");
-
-      return bytes.start;
-    }
-
-    const unsigned char *handle_map(uint64_t N, byte_range bytes) {
-      printf("\n%*s{\n", indent, "");
-      indent += by;
-
-      for (uint64_t i = 0; i < 2 * N; i += 2) {
-        const unsigned char *start_key = bytes.start;
-        printf("%*s", indent, "");
-        const unsigned char *end_key =
-            handle_msgpack<inner>({start_key, bytes.end}, {indent});
-        if (!end_key) {
-          break;
-        }
-
-        printf(" : ");
-
-        const unsigned char *start_value = end_key;
-        const unsigned char *end_value =
-            handle_msgpack<inner>({start_value, bytes.end}, {indent});
-
-        if (!end_value) {
-          break;
-        }
-
-        printf(",\n");
-        bytes.start = end_value;
-      }
-
-      indent -= by;
-      printf("%*s}", indent, "");
-
-      return bytes.start;
-    }
-  };
-
-  handle_msgpack<inner>(bytes, {0});
-  printf("\n");
-}
-
-} // namespace msgpack
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
+++ /dev/null
@@ -1,46 +0,0 @@
-//===--- amdgpu/impl/msgpack.def ---------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// name, header width, reader, [lower, upper] encoding
-X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
-X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
-X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f)
-X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f)
-X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf)
-X(nil, 1, read_zero, 0xc0, 0xc0)
-X(never_used, 1, read_zero, 0xc1, 0xc1)
-X(f, 1, read_via_mask_0x1, 0xc2, 0xc2)
-X(t, 1, read_via_mask_0x1, 0xc3, 0xc3)
-X(bin8, 2, read_size_field_u8, 0xc4, 0xc4)
-X(bin16, 3, read_size_field_u16, 0xc5, 0xc5)
-X(bin32, 5, read_size_field_u32, 0xc6, 0xc6)
-X(ext8, 3, read_size_field_u8, 0xc7, 0xc7)
-X(ext16, 4, read_size_field_u16, 0xc8, 0xc8)
-X(ext32, 6, read_size_field_u32, 0xc9, 0xc9)
-X(float32, 5, read_zero, 0xca, 0xca)
-X(float64, 9, read_zero, 0xcb, 0xcb)
-X(uint8, 2, read_size_field_u8, 0xcc, 0xcc)
-X(uint16, 3, read_size_field_u16, 0xcd, 0xcd)
-X(uint32, 5, read_size_field_u32, 0xce, 0xce)
-X(uint64, 9, read_size_field_u64, 0xcf, 0xcf)
-X(int8, 2, read_size_field_s8, 0xd0, 0xd0)
-X(int16, 3, read_size_field_s16, 0xd1, 0xd1)
-X(int32, 5, read_size_field_s32, 0xd2, 0xd2)
-X(int64, 9, read_size_field_s64, 0xd3, 0xd3)
-X(fixext1, 3, read_zero, 0xd4, 0xd4)
-X(fixext2, 4, read_zero, 0xd5, 0xd5)
-X(fixext4, 6, read_zero, 0xd6, 0xd6)
-X(fixext8, 10, read_zero, 0xd7, 0xd7)
-X(fixext16, 18, read_zero, 0xd8, 0xd8)
-X(str8, 2, read_size_field_u8, 0xd9, 0xd9)
-X(str16, 3, read_size_field_u16, 0xda, 0xda)
-X(str32, 5, read_size_field_u32, 0xdb, 0xdb)
-X(array16, 3, read_size_field_u16, 0xdc, 0xdc)
-X(array32, 5, read_size_field_u32, 0xdd, 0xdd)
-X(map16, 3, read_size_field_u16, 0xde, 0xde)
-X(map32, 5, read_size_field_u32, 0xdf, 0xdf)
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- amdgpu/impl/rt.h ----------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_RT_H_
-#define SRC_RUNTIME_INCLUDE_RT_H_
-
-#include "hsa_api.h"
-#include "impl_runtime.h"
-#include "internal.h"
-
-#include <string>
-
-namespace core {
-namespace Runtime {
-hsa_status_t Memfree(void *);
-hsa_status_t HostMalloc(void **ptr, size_t size,
-                        hsa_amd_memory_pool_t MemoryPool);
-
-} // namespace Runtime
-hsa_status_t RegisterModuleFromMemory(
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    void *module_bytes, size_t module_size, hsa_agent_t agent,
-    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
-                                         void *cb_state),
-    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
-
-} // namespace core
-
-#endif // SRC_RUNTIME_INCLUDE_RT_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ /dev/null
@@ -1,744 +0,0 @@
-//===--- amdgpu/impl/system.cpp ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include <cassert>
-#include <sstream>
-#include <string>
-
-#include "internal.h"
-#include "rt.h"
-
-#include "msgpack.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-
-namespace hsa {
-// Wrap HSA iterate API in a shim that allows passing general callables
-template <typename C>
-hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
-  auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
-              void *data) -> hsa_status_t {
-    C *unwrapped = static_cast<C *>(data);
-    return (*unwrapped)(executable, symbol);
-  };
-  return hsa_executable_iterate_symbols(executable, L,
-                                        static_cast<void *>(&cb));
-}
-} // namespace hsa
-
-typedef unsigned char *address;
-/*
- * Note descriptors.
- */
-// FreeBSD already declares Elf_Note (indirectly via <libelf.h>)
-#if !defined(__FreeBSD__)
-typedef struct {
-  uint32_t n_namesz; /* Length of note's name. */
-  uint32_t n_descsz; /* Length of note's value. */
-  uint32_t n_type;   /* Type of note. */
-  // then name
-  // then padding, optional
-  // then desc, at 4 byte alignment (not 8, despite being elf64)
-} Elf_Note;
-#endif
-
-class KernelArgMD {
-public:
-  enum class ValueKind {
-    HiddenGlobalOffsetX,
-    HiddenGlobalOffsetY,
-    HiddenGlobalOffsetZ,
-    HiddenNone,
-    HiddenPrintfBuffer,
-    HiddenDefaultQueue,
-    HiddenCompletionAction,
-    HiddenMultiGridSyncArg,
-    HiddenHostcallBuffer,
-    HiddenHeapV1,
-    Unknown
-  };
-
-  KernelArgMD()
-      : name_(std::string()), size_(0), offset_(0),
-        valueKind_(ValueKind::Unknown) {}
-
-  // fields
-  std::string name_;
-  uint32_t size_;
-  uint32_t offset_;
-  ValueKind valueKind_;
-};
-
-static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
-    // v3
-    //    {"by_value", KernelArgMD::ValueKind::ByValue},
-    //    {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
-    //    {"dynamic_shared_pointer",
-    //    KernelArgMD::ValueKind::DynamicSharedPointer},
-    //    {"sampler", KernelArgMD::ValueKind::Sampler},
-    //    {"image", KernelArgMD::ValueKind::Image},
-    //    {"pipe", KernelArgMD::ValueKind::Pipe},
-    //    {"queue", KernelArgMD::ValueKind::Queue},
-    {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
-    {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
-    {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
-    {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
-    {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
-    {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
-    {"hidden_completion_action",
-     KernelArgMD::ValueKind::HiddenCompletionAction},
-    {"hidden_multigrid_sync_arg",
-     KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
-    {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
-    {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
-
-namespace core {
-
-hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
-  if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
-    hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
-    // memory_fault.agent
-    // memory_fault.virtual_address
-    // memory_fault.fault_reason_mask
-    // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
-    std::stringstream stream;
-    stream << std::hex << (uintptr_t)memory_fault.virtual_address;
-    std::string addr("0x" + stream.str());
-
-    std::string err_string = "[GPU Memory Error] Addr: " + addr;
-    err_string += " Reason: ";
-    if (!(memory_fault.fault_reason_mask & 0x00111111)) {
-      err_string += "No Idea! ";
-    } else {
-      if (memory_fault.fault_reason_mask & 0x00000001)
-        err_string += "Page not present or supervisor privilege. ";
-      if (memory_fault.fault_reason_mask & 0x00000010)
-        err_string += "Write access to a read-only page. ";
-      if (memory_fault.fault_reason_mask & 0x00000100)
-        err_string += "Execute access to a page marked NX. ";
-      if (memory_fault.fault_reason_mask & 0x00001000)
-        err_string += "Host access only. ";
-      if (memory_fault.fault_reason_mask & 0x00010000)
-        err_string += "ECC failure (if supported by HW). ";
-      if (memory_fault.fault_reason_mask & 0x00100000)
-        err_string += "Can't determine the exact fault address. ";
-    }
-    fprintf(stderr, "%s\n", err_string.c_str());
-    return HSA_STATUS_ERROR;
-  }
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t atl_init_gpu_context() {
-  hsa_status_t err = hsa_amd_register_system_event_handler(callbackEvent, NULL);
-  if (err != HSA_STATUS_SUCCESS) {
-    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Registering the system for memory faults", get_error_string(err));
-    return HSA_STATUS_ERROR;
-  }
-
-  return HSA_STATUS_SUCCESS;
-}
-
-static bool isImplicit(KernelArgMD::ValueKind value_kind) {
-  switch (value_kind) {
-  case KernelArgMD::ValueKind::HiddenGlobalOffsetX:
-  case KernelArgMD::ValueKind::HiddenGlobalOffsetY:
-  case KernelArgMD::ValueKind::HiddenGlobalOffsetZ:
-  case KernelArgMD::ValueKind::HiddenNone:
-  case KernelArgMD::ValueKind::HiddenPrintfBuffer:
-  case KernelArgMD::ValueKind::HiddenDefaultQueue:
-  case KernelArgMD::ValueKind::HiddenCompletionAction:
-  case KernelArgMD::ValueKind::HiddenMultiGridSyncArg:
-  case KernelArgMD::ValueKind::HiddenHostcallBuffer:
-  case KernelArgMD::ValueKind::HiddenHeapV1:
-    return true;
-  default:
-    return false;
-  }
-}
-
-static std::pair<const unsigned char *, const unsigned char *>
-findMetadata(const ELFObjectFile<ELF64LE> &ELFObj) {
-  constexpr std::pair<const unsigned char *, const unsigned char *> Failure = {
-      nullptr, nullptr};
-  const auto &Elf = ELFObj.getELFFile();
-  auto PhdrsOrErr = Elf.program_headers();
-  if (!PhdrsOrErr) {
-    consumeError(PhdrsOrErr.takeError());
-    return Failure;
-  }
-
-  for (auto Phdr : *PhdrsOrErr) {
-    if (Phdr.p_type != PT_NOTE)
-      continue;
-
-    Error Err = Error::success();
-    for (auto Note : Elf.notes(Phdr, Err)) {
-      if (Note.getType() == 7 || Note.getType() == 8)
-        return Failure;
-
-      // Code object v2 uses yaml metadata and is no longer supported.
-      if (Note.getType() == NT_AMD_HSA_METADATA && Note.getName() == "AMD")
-        return Failure;
-      // Code object v3 should have AMDGPU metadata.
-      if (Note.getType() == NT_AMDGPU_METADATA && Note.getName() != "AMDGPU")
-        return Failure;
-
-      ArrayRef<uint8_t> Desc = Note.getDesc();
-      return {Desc.data(), Desc.data() + Desc.size()};
-    }
-
-    if (Err) {
-      consumeError(std::move(Err));
-      return Failure;
-    }
-  }
-
-  return Failure;
-}
-
-static std::pair<const unsigned char *, const unsigned char *>
-find_metadata(void *binary, size_t binSize) {
-  constexpr std::pair<const unsigned char *, const unsigned char *> Failure = {
-      nullptr, nullptr};
-
-  StringRef Buffer = StringRef(static_cast<const char *>(binary), binSize);
-  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
-                                                  /*InitContent=*/false);
-  if (!ElfOrErr) {
-    consumeError(ElfOrErr.takeError());
-    return Failure;
-  }
-
-  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
-    return findMetadata(*ELFObj);
-  return Failure;
-}
-
-namespace {
-int map_lookup_array(msgpack::byte_range message, const char *needle,
-                     msgpack::byte_range *res, uint64_t *size) {
-  unsigned count = 0;
-  struct s : msgpack::functors_defaults<s> {
-    s(unsigned &count, uint64_t *size) : count(count), size(size) {}
-    unsigned &count;
-    uint64_t *size;
-    const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
-      count++;
-      *size = N;
-      return bytes.end;
-    }
-  };
-
-  msgpack::foreach_map(message,
-                       [&](msgpack::byte_range key, msgpack::byte_range value) {
-                         if (msgpack::message_is_string(key, needle)) {
-                           // If the message is an array, record number of
-                           // elements in *size
-                           msgpack::handle_msgpack<s>(value, {count, size});
-                           // return the whole array
-                           *res = value;
-                         }
-                       });
-  // Only claim success if exactly one key/array pair matched
-  return count != 1;
-}
-
-int map_lookup_string(msgpack::byte_range message, const char *needle,
-                      std::string *res) {
-  unsigned count = 0;
-  struct s : public msgpack::functors_defaults<s> {
-    s(unsigned &count, std::string *res) : count(count), res(res) {}
-    unsigned &count;
-    std::string *res;
-    void handle_string(size_t N, const unsigned char *str) {
-      count++;
-      *res = std::string(str, str + N);
-    }
-  };
-  msgpack::foreach_map(message,
-                       [&](msgpack::byte_range key, msgpack::byte_range value) {
-                         if (msgpack::message_is_string(key, needle)) {
-                           msgpack::handle_msgpack<s>(value, {count, res});
-                         }
-                       });
-  return count != 1;
-}
-
-int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
-                        uint64_t *res) {
-  unsigned count = 0;
-  msgpack::foreach_map(message,
-                       [&](msgpack::byte_range key, msgpack::byte_range value) {
-                         if (msgpack::message_is_string(key, needle)) {
-                           msgpack::foronly_unsigned(value, [&](uint64_t x) {
-                             count++;
-                             *res = x;
-                           });
-                         }
-                       });
-  return count != 1;
-}
-
-int array_lookup_element(msgpack::byte_range message, uint64_t elt,
-                         msgpack::byte_range *res) {
-  int rc = 1;
-  uint64_t i = 0;
-  msgpack::foreach_array(message, [&](msgpack::byte_range value) {
-    if (i == elt) {
-      *res = value;
-      rc = 0;
-    }
-    i++;
-  });
-  return rc;
-}
-
-int populate_kernelArgMD(msgpack::byte_range args_element,
-                         KernelArgMD *kernelarg) {
-  using namespace msgpack;
-  int error = 0;
-  foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
-    if (message_is_string(key, ".name")) {
-      foronly_string(value, [&](size_t N, const unsigned char *str) {
-        kernelarg->name_ = std::string(str, str + N);
-      });
-    } else if (message_is_string(key, ".size")) {
-      foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
-    } else if (message_is_string(key, ".offset")) {
-      foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
-    } else if (message_is_string(key, ".value_kind")) {
-      foronly_string(value, [&](size_t N, const unsigned char *str) {
-        std::string s = std::string(str, str + N);
-        auto itValueKind = ArgValueKind.find(s);
-        if (itValueKind != ArgValueKind.end()) {
-          kernelarg->valueKind_ = itValueKind->second;
-        }
-      });
-    }
-  });
-  return error;
-}
-} // namespace
-
-static hsa_status_t get_code_object_custom_metadata(
-    void *binary, size_t binSize,
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
-  // parse code object with different keys from v2
-  // also, the kernel name is not the same as the symbol name -- so a
-  // symbol->name map is needed
-
-  std::pair<const unsigned char *, const unsigned char *> metadata =
-      find_metadata(binary, binSize);
-  if (!metadata.first) {
-    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-  }
-
-  uint64_t kernelsSize = 0;
-  int msgpack_errors = 0;
-  msgpack::byte_range kernel_array;
-  msgpack_errors =
-      map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
-                       &kernel_array, &kernelsSize);
-  if (msgpack_errors != 0) {
-    printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-           "kernels lookup in program metadata");
-    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-  }
-
-  for (size_t i = 0; i < kernelsSize; i++) {
-    assert(msgpack_errors == 0);
-    std::string kernelName;
-    std::string symbolName;
-
-    msgpack::byte_range element;
-    msgpack_errors += array_lookup_element(kernel_array, i, &element);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "element lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    msgpack_errors += map_lookup_string(element, ".name", &kernelName);
-    msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "strings lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    // Make sure that kernelName + ".kd" == symbolName
-    if ((kernelName + ".kd") != symbolName) {
-      printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
-             __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-    uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
-    msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "sgpr count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.sgpr_count = sgpr_count;
-
-    msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "vgpr count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.vgpr_count = vgpr_count;
-
-    msgpack_errors +=
-        map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "sgpr spill count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.sgpr_spill_count = sgpr_spill_count;
-
-    msgpack_errors +=
-        map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "vgpr spill count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.vgpr_spill_count = vgpr_spill_count;
-
-    size_t kernel_explicit_args_size = 0;
-    uint64_t kernel_segment_size;
-    msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
-                                          &kernel_segment_size);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "kernarg segment size metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    bool hasHiddenArgs = false;
-    if (kernel_segment_size > 0) {
-      uint64_t argsSize;
-      size_t offset = 0;
-
-      msgpack::byte_range args_array;
-      msgpack_errors +=
-          map_lookup_array(element, ".args", &args_array, &argsSize);
-      if (msgpack_errors != 0) {
-        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-               "kernel args metadata lookup in kernel metadata");
-        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-      }
-
-      for (size_t i = 0; i < argsSize; ++i) {
-        KernelArgMD lcArg;
-
-        msgpack::byte_range args_element;
-        msgpack_errors += array_lookup_element(args_array, i, &args_element);
-        if (msgpack_errors != 0) {
-          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-                 "iterate args map in kernel args metadata");
-          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-        }
-
-        msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
-        if (msgpack_errors != 0) {
-          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-                 "iterate args map in kernel args metadata");
-          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-        }
-        // v3 has offset field and not align field
-        size_t new_offset = lcArg.offset_;
-        size_t padding = new_offset - offset;
-        offset = new_offset;
-        DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
-           lcArg.offset_);
-        offset += lcArg.size_;
-
-        // check if the arg is a hidden/implicit arg
-        // this logic assumes that all hidden args are 8-byte aligned
-        if (!isImplicit(lcArg.valueKind_)) {
-          info.explicit_argument_count++;
-          kernel_explicit_args_size += lcArg.size_;
-        } else {
-          info.implicit_argument_count++;
-          hasHiddenArgs = true;
-        }
-        kernel_explicit_args_size += padding;
-      }
-    }
-
-    // TODO: Probably don't want this arithmetic
-    info.kernel_segment_size =
-        (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
-    DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
-       kernel_segment_size, info.kernel_segment_size);
-
-    // kernel received, now add it to the kernel info table
-    KernelInfoTable[kernelName] = info;
-  }
-
-  return HSA_STATUS_SUCCESS;
-}
-
-static hsa_status_t
-populate_InfoTables(hsa_executable_symbol_t symbol,
-                    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-                    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
-  hsa_symbol_kind_t type;
-
-  uint32_t name_length;
-  hsa_status_t err;
-  err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
-                                       &type);
-  if (err != HSA_STATUS_SUCCESS) {
-    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Symbol info extraction", get_error_string(err));
-    return err;
-  }
-  DP("Exec Symbol type: %d\n", type);
-  if (type == HSA_SYMBOL_KIND_KERNEL) {
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info extraction", get_error_string(err));
-      return err;
-    }
-    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
-    err = hsa_executable_symbol_get_info(symbol,
-                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info extraction", get_error_string(err));
-      return err;
-    }
-    // remove the suffix .kd from symbol name.
-    name[name_length - 3] = 0;
-
-    atl_kernel_info_t info;
-    std::string kernelName(name);
-    // by now, the kernel info table should already have an entry
-    // because the non-ROCr custom code object parsing is called before
-    // iterating over the code object symbols using ROCr
-    if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) {
-      DP("amdgpu internal consistency error\n");
-      return HSA_STATUS_ERROR;
-    }
-    // found, so assign and update
-    info = KernelInfoTable[kernelName];
-
-    /* Extract dispatch information from the symbol */
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
-        &(info.kernel_object));
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Extracting the symbol from the executable",
-             get_error_string(err));
-      return err;
-    }
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
-        &(info.group_segment_size));
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Extracting the group segment size from the executable",
-             get_error_string(err));
-      return err;
-    }
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
-        &(info.private_segment_size));
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Extracting the private segment from the executable",
-             get_error_string(err));
-      return err;
-    }
-
-    DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes "
-       "kernarg\n",
-       kernelName.c_str(), info.kernel_object, info.group_segment_size,
-       info.private_segment_size, info.kernel_segment_size);
-
-    // assign it back to the kernel info table
-    KernelInfoTable[kernelName] = info;
-    free(name);
-  } else if (type == HSA_SYMBOL_KIND_VARIABLE) {
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info extraction", get_error_string(err));
-      return err;
-    }
-    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
-    err = hsa_executable_symbol_get_info(symbol,
-                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info extraction", get_error_string(err));
-      return err;
-    }
-    name[name_length] = 0;
-
-    atl_symbol_info_t info;
-
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr));
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info address extraction", get_error_string(err));
-      return err;
-    }
-
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size));
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info size extraction", get_error_string(err));
-      return err;
-    }
-
-    DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size);
-    SymbolInfoTable[std::string(name)] = info;
-    free(name);
-  } else {
-    DP("Symbol is an indirect function\n");
-  }
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t RegisterModuleFromMemory(
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    void *module_bytes, size_t module_size, hsa_agent_t agent,
-    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
-                                         void *cb_state),
-    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables) {
-  hsa_status_t err;
-  hsa_executable_t executable = {0};
-  hsa_profile_t agent_profile;
-
-  err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile);
-  if (err != HSA_STATUS_SUCCESS) {
-    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Query the agent profile", get_error_string(err));
-    return HSA_STATUS_ERROR;
-  }
-  // FIXME: Assume that every profile is FULL until we understand how to build
-  // GCN with base profile
-  agent_profile = HSA_PROFILE_FULL;
-  /* Create the empty executable.  */
-  err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "",
-                              &executable);
-  if (err != HSA_STATUS_SUCCESS) {
-    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Create the executable", get_error_string(err));
-    return HSA_STATUS_ERROR;
-  }
-
-  bool module_load_success = false;
-  do // Existing control flow used continue, preserve that for this patch
-  {
-    {
-      // Some metadata info is not available through ROCr API, so use custom
-      // code object metadata parsing to collect such metadata info
-
-      err = get_code_object_custom_metadata(module_bytes, module_size,
-                                            KernelInfoTable);
-      if (err != HSA_STATUS_SUCCESS) {
-        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Getting custom code object metadata", get_error_string(err));
-        continue;
-      }
-
-      // Deserialize code object.
-      hsa_code_object_t code_object = {0};
-      err = hsa_code_object_deserialize(module_bytes, module_size, NULL,
-                                        &code_object);
-      if (err != HSA_STATUS_SUCCESS) {
-        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Code Object Deserialization", get_error_string(err));
-        continue;
-      }
-      assert(0 != code_object.handle);
-
-      // Mutating the device image here avoids another allocation & memcpy
-      void *code_object_alloc_data =
-          reinterpret_cast<void *>(code_object.handle);
-      hsa_status_t impl_err =
-          on_deserialized_data(code_object_alloc_data, module_size, cb_state);
-      if (impl_err != HSA_STATUS_SUCCESS) {
-        printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-               "Error in deserialized_data callback",
-               get_error_string(impl_err));
-        return impl_err;
-      }
-
-      /* Load the code object.  */
-      err =
-          hsa_executable_load_code_object(executable, agent, code_object, NULL);
-      if (err != HSA_STATUS_SUCCESS) {
-        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Loading the code object", get_error_string(err));
-        continue;
-      }
-
-      // cannot iterate over symbols until executable is frozen
-    }
-    module_load_success = true;
-  } while (0);
-  DP("Modules loaded successful? %d\n", module_load_success);
-  if (module_load_success) {
-    /* Freeze the executable; it can now be queried for symbols.  */
-    err = hsa_executable_freeze(executable, "");
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Freeze the executable", get_error_string(err));
-      return HSA_STATUS_ERROR;
-    }
-
-    err = hsa::executable_iterate_symbols(
-        executable,
-        [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t {
-          return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable);
-        });
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Iterating over symbols for execuatable", get_error_string(err));
-      return HSA_STATUS_ERROR;
-    }
-
-    // save the executable and destroy during finalize
-    HSAExecutables.push_back(executable);
-    return HSA_STATUS_SUCCESS;
-  } else {
-    return HSA_STATUS_ERROR;
-  }
-}
-
-} // namespace core
diff --git a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h
+++ /dev/null
@@ -1,20 +0,0 @@
-//===--- amdgpu/src/print_tracing.h ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED
-#define LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED
-
-enum PrintTraceControlBits {
-  LAUNCH = 1,          // print a message to stderr for each kernel launch
-  RTL_TIMING = 2,      // Print timing info around each RTL step
-  STARTUP_DETAILS = 4, // Details around loading up kernel
-  RTL_TO_STDOUT = 8    // Redirect RTL tracing to stdout
-};
-
-extern int print_kernel_trace; // set by environment variable
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ /dev/null
@@ -1,2615 +0,0 @@
-//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for AMD hsa machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include <algorithm>
-#include <assert.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <functional>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <unordered_map>
-#include <vector>
-
-#include "ELFSymbols.h"
-#include "impl_runtime.h"
-#include "interop_hsa.h"
-
-#include "UtilitiesRTL.h"
-#include "internal.h"
-#include "rt.h"
-
-#include "DeviceEnvironment.h"
-#include "get_elf_mach_gfx_name.h"
-#include "omptargetplugin.h"
-#include "print_tracing.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-using namespace llvm::omp::target::plugin::utils;
-
-// hostrpc interface, FIXME: consider moving to its own include these are
-// statically linked into amdgpu/plugin if present from hostrpc_services.a,
-// linked as --whole-archive to override the weak symbols that are used to
-// implement a fallback for toolchains that do not yet have a hostrpc library.
-extern "C" {
-uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
-                               uint32_t DeviceId);
-hsa_status_t hostrpc_init();
-hsa_status_t hostrpc_terminate();
-
-__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
-__attribute__((weak)) hsa_status_t hostrpc_terminate() {
-  return HSA_STATUS_SUCCESS;
-}
-__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
-                                                     uint32_t DeviceId) {
-  DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
-     "missing\n",
-     DeviceId);
-  return 0;
-}
-}
-
-// Heuristic parameters used for kernel launch
-// Number of teams per CU to allow scheduling flexibility
-static const unsigned DefaultTeamsPerCU = 4;
-
-int print_kernel_trace;
-
-#ifdef OMPTARGET_DEBUG
-#define check(msg, status)                                                     \
-  if (status != HSA_STATUS_SUCCESS) {                                          \
-    DP(#msg " failed\n");                                                      \
-  } else {                                                                     \
-    DP(#msg " succeeded\n");                                                   \
-  }
-#else
-#define check(msg, status)                                                     \
-  {}
-#endif
-
-#include "elf_common.h"
-
-namespace hsa {
-template <typename C> hsa_status_t iterate_agents(C Cb) {
-  auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
-    C *Unwrapped = static_cast<C *>(Data);
-    return (*Unwrapped)(Agent);
-  };
-  return hsa_iterate_agents(L, static_cast<void *>(&Cb));
-}
-
-template <typename C>
-hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
-  auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
-    C *Unwrapped = static_cast<C *>(Data);
-    return (*Unwrapped)(MemoryPool);
-  };
-
-  return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
-}
-
-} // namespace hsa
-
-/// Keep entries table per device
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  std::vector<__tgt_offload_entry> Entries;
-};
-
-struct KernelArgPool {
-private:
-  static pthread_mutex_t Mutex;
-
-public:
-  uint32_t KernargSegmentSize;
-  void *KernargRegion = nullptr;
-  std::queue<int> FreeKernargSegments;
-
-  uint32_t kernargSizeIncludingImplicit() {
-    return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy);
-  }
-
-  ~KernelArgPool() {
-    if (KernargRegion) {
-      auto R = hsa_amd_memory_pool_free(KernargRegion);
-      if (R != HSA_STATUS_SUCCESS) {
-        DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
-      }
-    }
-  }
-
-  // Can't really copy or move a mutex
-  KernelArgPool() = default;
-  KernelArgPool(const KernelArgPool &) = delete;
-  KernelArgPool(KernelArgPool &&) = delete;
-
-  KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
-      : KernargSegmentSize(KernargSegmentSize) {
-
-    // impl uses one pool per kernel for all gpus, with a fixed upper size
-    // preserving that exact scheme here, including the queue<int>
-
-    hsa_status_t Err = hsa_amd_memory_pool_allocate(
-        MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
-        &KernargRegion);
-
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
-      KernargRegion = nullptr; // paranoid
-      return;
-    }
-
-    Err = core::allow_access_to_all_gpu_agents(KernargRegion);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
-         get_error_string(Err));
-      auto R = hsa_amd_memory_pool_free(KernargRegion);
-      if (R != HSA_STATUS_SUCCESS) {
-        // if free failed, can't do anything more to resolve it
-        DP("hsa memory poll free failed: %s\n", get_error_string(Err));
-      }
-      KernargRegion = nullptr;
-      return;
-    }
-
-    for (int I = 0; I < MAX_NUM_KERNELS; I++) {
-      FreeKernargSegments.push(I);
-    }
-  }
-
-  void *allocate(uint64_t ArgNum) {
-    assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
-    Lock L(&Mutex);
-    void *Res = nullptr;
-    if (!FreeKernargSegments.empty()) {
-
-      int FreeIdx = FreeKernargSegments.front();
-      Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
-                                (FreeIdx * kernargSizeIncludingImplicit()));
-      assert(FreeIdx == pointerToIndex(Res));
-      FreeKernargSegments.pop();
-    }
-    return Res;
-  }
-
-  void deallocate(void *Ptr) {
-    Lock L(&Mutex);
-    int Idx = pointerToIndex(Ptr);
-    FreeKernargSegments.push(Idx);
-  }
-
-private:
-  int pointerToIndex(void *Ptr) {
-    ptrdiff_t Bytes =
-        static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
-    assert(Bytes >= 0);
-    assert(Bytes % kernargSizeIncludingImplicit() == 0);
-    return Bytes / kernargSizeIncludingImplicit();
-  }
-  struct Lock {
-    Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
-    ~Lock() { pthread_mutex_unlock(M); }
-    pthread_mutex_t *M;
-  };
-};
-pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
-
-std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
-    KernelArgPoolMap;
-
-/// Use a single entity to encode a kernel and a set of flags
-struct KernelTy {
-  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
-  int16_t ConstWGSize;
-  int32_t DeviceId;
-  void *CallStackAddr = nullptr;
-  const char *Name;
-
-  KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
-           int32_t DeviceId, void *CallStackAddr, const char *Name,
-           uint32_t KernargSegmentSize,
-           hsa_amd_memory_pool_t &KernArgMemoryPool)
-      : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
-        DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
-    DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
-
-    std::string N(Name);
-    if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
-      KernelArgPoolMap.insert(
-          std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
-                                KernargSegmentSize, KernArgMemoryPool))));
-    }
-  }
-};
-
-/// List that contains all the kernels.
-/// FIXME: we may need this to be per device and per library.
-std::list<KernelTy> KernelsList;
-
-template <typename Callback> static hsa_status_t findAgents(Callback CB) {
-
-  hsa_status_t Err =
-      hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
-        hsa_device_type_t DeviceType;
-        // get_info fails iff HSA runtime not yet initialized
-        hsa_status_t Err =
-            hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
-
-        if (Err != HSA_STATUS_SUCCESS) {
-          if (print_kernel_trace > 0)
-            DP("rtl.cpp: err %s\n", get_error_string(Err));
-
-          return Err;
-        }
-
-        CB(DeviceType, Agent);
-        return HSA_STATUS_SUCCESS;
-      });
-
-  // iterate_agents fails iff HSA runtime not yet initialized
-  if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
-    DP("rtl.cpp: err %s\n", get_error_string(Err));
-  }
-
-  return Err;
-}
-
-static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
-                          void *Data) {
-  if (Status != HSA_STATUS_SUCCESS) {
-    const char *StatusString;
-    if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
-      StatusString = "unavailable";
-    }
-    DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
-       Status, StatusString);
-    abort();
-  }
-}
-
-namespace core {
-namespace {
-
-bool checkResult(hsa_status_t Err, const char *ErrMsg) {
-  if (Err == HSA_STATUS_SUCCESS)
-    return true;
-
-  REPORT("%s", ErrMsg);
-  REPORT("%s", get_error_string(Err));
-  return false;
-}
-
-void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
-  __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
-}
-
-uint16_t createHeader() {
-  uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
-  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
-  Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
-  return Header;
-}
-
-hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
-  bool AllocAllowed = false;
-  hsa_status_t Err = hsa_amd_memory_pool_get_info(
-      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
-      &AllocAllowed);
-  if (Err != HSA_STATUS_SUCCESS) {
-    DP("Alloc allowed in memory pool check failed: %s\n",
-       get_error_string(Err));
-    return Err;
-  }
-
-  size_t Size = 0;
-  Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
-                                     &Size);
-  if (Err != HSA_STATUS_SUCCESS) {
-    DP("Get memory pool size failed: %s\n", get_error_string(Err));
-    return Err;
-  }
-
-  return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
-}
-
-hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
-  std::vector<hsa_amd_memory_pool_t> *Result =
-      static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
-
-  hsa_status_t Err;
-  if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
-    return Err;
-  }
-
-  Result->push_back(MemoryPool);
-  return HSA_STATUS_SUCCESS;
-}
-
-} // namespace
-} // namespace core
-
-struct EnvironmentVariables {
-  int NumTeams;
-  int TeamLimit;
-  int TeamThreadLimit;
-  int MaxTeamsDefault;
-  int DynamicMemSize;
-};
-
-template <uint32_t wavesize>
-static constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<wavesize>();
-}
-
-struct HSALifetime {
-  // Wrapper around HSA used to ensure it is constructed before other types
-  // and destructed after, which means said other types can use raii for
-  // cleanup without risking running outside of the lifetime of HSA
-  const hsa_status_t S;
-
-  bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
-  HSALifetime() : S(hsa_init()) {}
-
-  ~HSALifetime() {
-    if (S == HSA_STATUS_SUCCESS) {
-      hsa_status_t Err = hsa_shut_down();
-      if (Err != HSA_STATUS_SUCCESS) {
-        // Can't call into HSA to get a string from the integer
-        DP("Shutting down HSA failed: %d\n", Err);
-      }
-    }
-  }
-};
-
-// Handle scheduling of multiple hsa_queue's per device to
-// multiple threads (one scheduler per device)
-class HSAQueueScheduler {
-public:
-  HSAQueueScheduler() : Current(0) {}
-
-  HSAQueueScheduler(const HSAQueueScheduler &) = delete;
-
-  HSAQueueScheduler(HSAQueueScheduler &&Q) {
-    Current = Q.Current.load();
-    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
-      HSAQueues[I] = Q.HSAQueues[I];
-      Q.HSAQueues[I] = nullptr;
-    }
-  }
-
-  // \return false if any HSA queue creation fails
-  bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
-    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
-      hsa_queue_t *Q = nullptr;
-      hsa_status_t Rc =
-          hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
-                           callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
-      if (Rc != HSA_STATUS_SUCCESS) {
-        DP("Failed to create HSA queue %d\n", I);
-        return false;
-      }
-      HSAQueues[I] = Q;
-    }
-    return true;
-  }
-
-  ~HSAQueueScheduler() {
-    for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
-      if (HSAQueues[I]) {
-        hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
-        if (Err != HSA_STATUS_SUCCESS)
-          DP("Error destroying HSA queue");
-      }
-    }
-  }
-
-  // \return next queue to use for device
-  hsa_queue_t *next() {
-    return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
-                     NUM_QUEUES_PER_DEVICE];
-  }
-
-private:
-  // Number of queues per device
-  enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
-  hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
-  std::atomic<uint8_t> Current;
-};
-
-/// Class containing all the device information
-class RTLDeviceInfoTy : HSALifetime {
-  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-
-  struct QueueDeleter {
-    void operator()(hsa_queue_t *Q) {
-      if (Q) {
-        hsa_status_t Err = hsa_queue_destroy(Q);
-        if (Err != HSA_STATUS_SUCCESS) {
-          DP("Error destroying hsa queue: %s\n", get_error_string(Err));
-        }
-      }
-    }
-  };
-
-public:
-  bool ConstructionSucceeded = false;
-
-  // load binary populates symbol tables and mutates various global state
-  // run uses those symbol tables
-  std::shared_timed_mutex LoadRunLock;
-
-  int NumberOfDevices = 0;
-
-  // GPU devices
-  std::vector<hsa_agent_t> HSAAgents;
-  std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
-
-  // CPUs
-  std::vector<hsa_agent_t> CPUAgents;
-
-  // Device properties
-  std::vector<int> ComputeUnits;
-  std::vector<int> GroupsPerDevice;
-  std::vector<int> ThreadsPerGroup;
-  std::vector<int> WarpSize;
-  std::vector<std::string> GPUName;
-  std::vector<std::string> TargetID;
-
-  // OpenMP properties
-  std::vector<int> NumTeams;
-  std::vector<int> NumThreads;
-
-  // OpenMP Environment properties
-  EnvironmentVariables Env;
-
-  // OpenMP Requires Flags
-  int64_t RequiresFlags;
-
-  // Resource pools
-  SignalPoolT FreeSignalPool;
-
-  bool HostcallRequired = false;
-
-  std::vector<hsa_executable_t> HSAExecutables;
-
-  std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
-  std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
-
-  hsa_amd_memory_pool_t KernArgPool;
-
-  // fine grained memory pool for host allocations
-  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
-
-  // fine and coarse-grained memory pools per offloading device
-  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
-  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
-
-  struct ImplFreePtrDeletor {
-    void operator()(void *P) {
-      core::Runtime::Memfree(P); // ignore failure to free
-    }
-  };
-
-  // device_State shared across loaded binaries, error if inconsistent size
-  std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
-      DeviceStateStore;
-
-  static const unsigned HardTeamLimit =
-      (1 << 16) - 1; // 64K needed to fit in uint16
-  static const int DefaultNumTeams = 128;
-
-  // These need to be per-device since different devices can have different
-  // wave sizes, but are currently the same number for each so that refactor
-  // can be postponed.
-  static_assert(getGridValue<32>().GV_Max_Teams ==
-                    getGridValue<64>().GV_Max_Teams,
-                "");
-  static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
-
-  static_assert(getGridValue<32>().GV_Max_WG_Size ==
-                    getGridValue<64>().GV_Max_WG_Size,
-                "");
-  static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
-
-  static_assert(getGridValue<32>().GV_Default_WG_Size ==
-                    getGridValue<64>().GV_Default_WG_Size,
-                "");
-  static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
-
-  using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
-                                      hsa_agent_t, hsa_amd_memory_pool_t);
-  hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
-                                    MemcpyFunc Func, int32_t DeviceId) {
-    hsa_agent_t Agent = HSAAgents[DeviceId];
-    hsa_signal_t S = FreeSignalPool.pop();
-    if (S.handle == 0) {
-      return HSA_STATUS_ERROR;
-    }
-    hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
-    FreeSignalPool.push(S);
-    return R;
-  }
-
-  hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
-                                       int32_t DeviceId) {
-    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
-  }
-
-  hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
-                                       int32_t DeviceId) {
-    return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
-  }
-
-  static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
-    char TmpChar[1000];
-    uint16_t Major, Minor;
-    uint32_t TmpUInt;
-    uint32_t TmpUInt2;
-    uint32_t CacheSize[4];
-    bool TmpBool;
-    uint16_t WorkgroupMaxDim[3];
-    hsa_dim3_t GridMaxDim;
-
-    // Getting basic information about HSA and Device
-    core::checkResult(
-        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
-        "Error from hsa_system_get_info when obtaining "
-        "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
-    core::checkResult(
-        hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
-        "Error from hsa_system_get_info when obtaining "
-        "HSA_SYSTEM_INFO_VERSION_MINOR\n");
-    printf("    HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
-    printf("    HSA OpenMP Device Number: \t\t%d \n", DeviceId);
-    core::checkResult(
-        hsa_agent_get_info(
-            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
-    printf("    Product Name: \t\t\t%s \n", TmpChar);
-    core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
-                      "Error returned from hsa_agent_get_info when obtaining "
-                      "HSA_AGENT_INFO_NAME\n");
-    printf("    Device Name: \t\t\t%s \n", TmpChar);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_NAME\n");
-    printf("    Vendor Name: \t\t\t%s \n", TmpChar);
-    hsa_device_type_t DevType;
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_DEVICE\n");
-    printf("    Device Type: \t\t\t%s \n",
-           DevType == HSA_DEVICE_TYPE_CPU
-               ? "CPU"
-               : (DevType == HSA_DEVICE_TYPE_GPU
-                      ? "GPU"
-                      : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_QUEUES_MAX\n");
-    printf("    Max Queues: \t\t\t%u \n", TmpUInt);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
-    printf("    Queue Min Size: \t\t\t%u \n", TmpUInt);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
-    printf("    Queue Max Size: \t\t\t%u \n", TmpUInt);
-
-    // Getting cache information
-    printf("    Cache:\n");
-
-    // FIXME: This is deprecated according to HSA documentation. But using
-    // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
-    // runtime.
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_CACHE_SIZE\n");
-
-    for (int I = 0; I < 4; I++) {
-      if (CacheSize[I]) {
-        printf("      L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
-      }
-    }
-
-    core::checkResult(
-        hsa_agent_get_info(Agent,
-                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
-                           &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
-    printf("    Cacheline Size: \t\t\t%u \n", TmpUInt);
-    core::checkResult(
-        hsa_agent_get_info(
-            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
-            &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
-    printf("    Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
-    core::checkResult(
-        hsa_agent_get_info(
-            Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
-            &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
-    printf("    Compute Units: \t\t\t%u \n", TmpUInt);
-    core::checkResult(hsa_agent_get_info(
-                          Agent,
-                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
-                          &TmpUInt),
-                      "Error returned from hsa_agent_get_info when obtaining "
-                      "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
-    printf("    SIMD per CU: \t\t\t%u \n", TmpUInt);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
-    printf("    Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
-    printf("    Wavefront Size: \t\t\t%u \n", TmpUInt2);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
-    printf("    Workgroup Max Size: \t\t%u \n", TmpUInt);
-    core::checkResult(hsa_agent_get_info(Agent,
-                                         HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
-                                         WorkgroupMaxDim),
-                      "Error returned from hsa_agent_get_info when obtaining "
-                      "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
-    printf("    Workgroup Max Size per Dimension:\n");
-    printf("      x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
-    printf("      y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
-    printf("      z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
-    core::checkResult(hsa_agent_get_info(
-                          Agent,
-                          (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
-                          &TmpUInt),
-                      "Error returned from hsa_agent_get_info when obtaining "
-                      "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
-    printf("    Max Waves Per CU: \t\t\t%u \n", TmpUInt);
-    printf("    Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
-    printf("    Grid Max Size: \t\t\t%u \n", TmpUInt);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_GRID_MAX_DIM\n");
-    printf("    Grid Max Size per Dimension: \t\t\n");
-    printf("      x: \t\t\t\t%u\n", GridMaxDim.x);
-    printf("      y: \t\t\t\t%u\n", GridMaxDim.y);
-    printf("      z: \t\t\t\t%u\n", GridMaxDim.z);
-    core::checkResult(
-        hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
-        "Error returned from hsa_agent_get_info when obtaining "
-        "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
-    printf("    Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
-
-    printf("    Memory Pools:\n");
-    auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
-      std::string TmpStr;
-      size_t Size;
-      bool Alloc, Access;
-      hsa_amd_segment_t Segment;
-      hsa_amd_memory_pool_global_flag_t GlobalFlags;
-      core::checkResult(
-          hsa_amd_memory_pool_get_info(
-              Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
-          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
-          "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
-      core::checkResult(hsa_amd_memory_pool_get_info(
-                            Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
-                        "Error returned from hsa_amd_memory_pool_get_info when "
-                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
-
-      switch (Segment) {
-      case HSA_AMD_SEGMENT_GLOBAL:
-        TmpStr = "GLOBAL; FLAGS: ";
-        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
-          TmpStr += "KERNARG, ";
-        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
-          TmpStr += "FINE GRAINED, ";
-        if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
-          TmpStr += "COARSE GRAINED, ";
-        break;
-      case HSA_AMD_SEGMENT_READONLY:
-        TmpStr = "READONLY";
-        break;
-      case HSA_AMD_SEGMENT_PRIVATE:
-        TmpStr = "PRIVATE";
-        break;
-      case HSA_AMD_SEGMENT_GROUP:
-        TmpStr = "GROUP";
-        break;
-      }
-      printf("      Pool %s: \n", TmpStr.c_str());
-
-      core::checkResult(hsa_amd_memory_pool_get_info(
-                            Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
-                        "Error returned from hsa_amd_memory_pool_get_info when "
-                        "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
-      printf("        Size: \t\t\t\t %zu bytes\n", Size);
-      core::checkResult(
-          hsa_amd_memory_pool_get_info(
-              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
-          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
-          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
-      printf("        Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
-      core::checkResult(
-          hsa_amd_memory_pool_get_info(
-              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
-          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
-          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
-      printf("        Runtime Alloc Granule: \t\t %zu bytes\n", Size);
-      core::checkResult(
-          hsa_amd_memory_pool_get_info(
-              Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
-          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
-          "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
-      printf("        Runtime Alloc alignment: \t %zu bytes\n", Size);
-      core::checkResult(
-          hsa_amd_memory_pool_get_info(
-              Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
-          "Error returned from hsa_amd_memory_pool_get_info when obtaining "
-          "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
-      printf("        Accessable by all: \t\t %s\n",
-             (Access ? "TRUE" : "FALSE"));
-
-      return HSA_STATUS_SUCCESS;
-    };
-    // Iterate over all the memory regions for this agent. Get the memory region
-    // type and size
-    hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
-
-    printf("    ISAs:\n");
-    auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
-      char TmpChar[1000];
-      core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
-                        "Error returned from hsa_isa_get_info_alt when "
-                        "obtaining HSA_ISA_INFO_NAME\n");
-      printf("        Name: \t\t\t\t %s\n", TmpChar);
-
-      return HSA_STATUS_SUCCESS;
-    };
-    // Iterate over all the memory regions for this agent. Get the memory region
-    // type and size
-    hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
-  }
-
-  // Record entry point associated with device
-  void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    E.Entries.push_back(Entry);
-  }
-
-  // Return true if the entry is associated with device
-  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    for (auto &It : E.Entries) {
-      if (It.addr == Addr)
-        return true;
-    }
-
-    return false;
-  }
-
-  // Return the pointer to the target entries table
-  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    int32_t Size = E.Entries.size();
-
-    // Table is empty
-    if (!Size)
-      return 0;
-
-    __tgt_offload_entry *Begin = &E.Entries[0];
-    __tgt_offload_entry *End = &E.Entries[Size - 1];
-
-    // Update table info according to the entries and return the pointer
-    E.Table.EntriesBegin = Begin;
-    E.Table.EntriesEnd = ++End;
-
-    return &E.Table;
-  }
-
-  // Clear entries table for a device
-  void clearOffloadEntriesTable(int DeviceId) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncGblEntries[DeviceId].emplace_back();
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-    // KernelArgPoolMap.clear();
-    E.Entries.clear();
-    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
-  }
-
-  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
-                                   unsigned int DeviceId) {
-    assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
-    uint32_t GlobalFlags = 0;
-    hsa_status_t Err = hsa_amd_memory_pool_get_info(
-        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
-
-    if (Err != HSA_STATUS_SUCCESS) {
-      return Err;
-    }
-
-    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
-      DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
-    } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
-      DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
-    }
-
-    return HSA_STATUS_SUCCESS;
-  }
-
-  hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
-    for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
-      hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
-          Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
-            hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
-            if (ValidStatus != HSA_STATUS_SUCCESS) {
-              DP("Alloc allowed in memory pool check failed: %s\n",
-                 get_error_string(ValidStatus));
-              return HSA_STATUS_SUCCESS;
-            }
-            return addDeviceMemoryPool(MemoryPool, DeviceId);
-          });
-
-      if (Err != HSA_STATUS_SUCCESS) {
-        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Iterate all memory pools", get_error_string(Err));
-        return Err;
-      }
-    }
-    return HSA_STATUS_SUCCESS;
-  }
-
-  hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
-    std::vector<hsa_amd_memory_pool_t> HostPools;
-
-    // collect all the "valid" pools for all the given agents.
-    for (const auto &Agent : Agents) {
-      hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
-          Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
-      if (Err != HSA_STATUS_SUCCESS) {
-        DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
-      }
-    }
-
-    // We need two fine-grained pools.
-    //  1. One with kernarg flag set for storing kernel arguments
-    //  2. Second for host allocations
-    bool FineGrainedMemoryPoolSet = false;
-    bool KernArgPoolSet = false;
-    for (const auto &MemoryPool : HostPools) {
-      hsa_status_t Err = HSA_STATUS_SUCCESS;
-      uint32_t GlobalFlags = 0;
-      Err = hsa_amd_memory_pool_get_info(
-          MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
-      if (Err != HSA_STATUS_SUCCESS) {
-        DP("Get memory pool info failed: %s\n", get_error_string(Err));
-        return Err;
-      }
-
-      if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
-        if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
-          KernArgPool = MemoryPool;
-          KernArgPoolSet = true;
-        } else {
-          HostFineGrainedMemoryPool = MemoryPool;
-          FineGrainedMemoryPoolSet = true;
-        }
-      }
-    }
-
-    if (FineGrainedMemoryPoolSet && KernArgPoolSet)
-      return HSA_STATUS_SUCCESS;
-
-    return HSA_STATUS_ERROR;
-  }
-
-  hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
-    assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
-           "Invalid device Id");
-    return DeviceCoarseGrainedMemoryPools[DeviceId];
-  }
-
-  hsa_amd_memory_pool_t getHostMemoryPool() {
-    return HostFineGrainedMemoryPool;
-  }
-
-  static int readEnv(const char *Env, int Default = -1) {
-    const char *EnvStr = getenv(Env);
-    int Res = Default;
-    if (EnvStr) {
-      Res = std::stoi(EnvStr);
-      DP("Parsed %s=%d\n", Env, Res);
-    }
-    return Res;
-  }
-
-  RTLDeviceInfoTy() {
-    DP("Start initializing " GETNAME(TARGET_NAME) "\n");
-
-    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
-    // anytime. You do not need a debug library build.
-    //  0 => no tracing
-    //  1 => tracing dispatch only
-    // >1 => verbosity increase
-
-    if (!HSAInitSuccess()) {
-      DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
-      return;
-    }
-
-    if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
-      print_kernel_trace = atoi(EnvStr);
-    else
-      print_kernel_trace = 0;
-
-    hsa_status_t Err = core::atl_init_gpu_context();
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
-      return;
-    }
-
-    // Init hostcall soon after initializing hsa
-    hostrpc_init();
-
-    Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
-      if (DeviceType == HSA_DEVICE_TYPE_CPU) {
-        CPUAgents.push_back(Agent);
-      } else {
-        HSAAgents.push_back(Agent);
-      }
-    });
-    if (Err != HSA_STATUS_SUCCESS)
-      return;
-
-    NumberOfDevices = (int)HSAAgents.size();
-
-    if (NumberOfDevices == 0) {
-      DP("There are no devices supporting HSA.\n");
-      return;
-    }
-    DP("There are %d devices supporting HSA.\n", NumberOfDevices);
-
-    // Init the device info
-    HSAQueueSchedulers.reserve(NumberOfDevices);
-    FuncGblEntries.resize(NumberOfDevices);
-    ThreadsPerGroup.resize(NumberOfDevices);
-    ComputeUnits.resize(NumberOfDevices);
-    GPUName.resize(NumberOfDevices);
-    GroupsPerDevice.resize(NumberOfDevices);
-    WarpSize.resize(NumberOfDevices);
-    NumTeams.resize(NumberOfDevices);
-    NumThreads.resize(NumberOfDevices);
-    DeviceStateStore.resize(NumberOfDevices);
-    KernelInfoTable.resize(NumberOfDevices);
-    SymbolInfoTable.resize(NumberOfDevices);
-    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
-    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
-
-    Err = setupDevicePools(HSAAgents);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Setup for Device Memory Pools failed\n");
-      return;
-    }
-
-    Err = setupHostMemoryPools(CPUAgents);
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("Setup for Host Memory Pools failed\n");
-      return;
-    }
-
-    for (int I = 0; I < NumberOfDevices; I++) {
-      uint32_t QueueSize = 0;
-      {
-        hsa_status_t Err = hsa_agent_get_info(
-            HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
-        if (Err != HSA_STATUS_SUCCESS) {
-          DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
-          return;
-        }
-        enum { MaxQueueSize = 4096 };
-        if (QueueSize > MaxQueueSize) {
-          QueueSize = MaxQueueSize;
-        }
-      }
-
-      {
-        HSAQueueScheduler QSched;
-        if (!QSched.createQueues(HSAAgents[I], QueueSize))
-          return;
-        HSAQueueSchedulers.emplace_back(std::move(QSched));
-      }
-
-      DeviceStateStore[I] = {nullptr, 0};
-    }
-
-    for (int I = 0; I < NumberOfDevices; I++) {
-      ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
-      GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
-      ComputeUnits[I] = 1;
-      DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
-         GroupsPerDevice[I], ThreadsPerGroup[I]);
-    }
-
-    // Get environment variables regarding teams
-    Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
-    Env.NumTeams = readEnv("OMP_NUM_TEAMS");
-    Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
-    Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
-    Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
-
-    // Default state.
-    RequiresFlags = OMP_REQ_UNDEFINED;
-
-    ConstructionSucceeded = true;
-  }
-
-  ~RTLDeviceInfoTy() {
-    DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
-    if (!HSAInitSuccess()) {
-      // Then none of these can have been set up and they can't be torn down
-      return;
-    }
-    // Run destructors on types that use HSA before
-    // impl_finalize removes access to it
-    DeviceStateStore.clear();
-    KernelArgPoolMap.clear();
-    // Terminate hostrpc before finalizing hsa
-    hostrpc_terminate();
-
-    hsa_status_t Err;
-    for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
-      Err = hsa_executable_destroy(HSAExecutables[I]);
-      if (Err != HSA_STATUS_SUCCESS) {
-        DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Destroying executable", get_error_string(Err));
-      }
-    }
-  }
-};
-
-pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
-
-// Putting accesses to DeviceInfo global behind a function call prior
-// to changing to use init_plugin/deinit_plugin calls
-static RTLDeviceInfoTy DeviceInfoState;
-static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
-
-namespace {
-
-int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
-                     __tgt_async_info *AsyncInfo) {
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  // Return success if we are not copying back to host from target.
-  if (!HstPtr)
-    return OFFLOAD_SUCCESS;
-  hsa_status_t Err;
-  DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
-     (long long unsigned)(Elf64_Addr)TgtPtr,
-     (long long unsigned)(Elf64_Addr)HstPtr);
-
-  Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
-                                             DeviceId);
-
-  if (Err != HSA_STATUS_SUCCESS) {
-    DP("Error when copying data from device to host. Pointers: "
-       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
-       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
-    return OFFLOAD_FAIL;
-  }
-  DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
-     (long long unsigned)(Elf64_Addr)TgtPtr,
-     (long long unsigned)(Elf64_Addr)HstPtr);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
-                   __tgt_async_info *AsyncInfo) {
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-  hsa_status_t Err;
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  // Return success if we are not doing host to target.
-  if (!HstPtr)
-    return OFFLOAD_SUCCESS;
-
-  DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
-     (long long unsigned)(Elf64_Addr)HstPtr,
-     (long long unsigned)(Elf64_Addr)TgtPtr);
-  Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
-                                             DeviceId);
-  if (Err != HSA_STATUS_SUCCESS) {
-    DP("Error when copying data from host to device. Pointers: "
-       "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
-       (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-// Async.
-// The implementation was written with cuda streams in mind. The semantics of
-// that are to execute kernels on a queue in order of insertion. A synchronise
-// call then makes writes visible between host and device. This means a series
-// of N data_submit_async calls are expected to execute serially. HSA offers
-// various options to run the data copies concurrently. This may require changes
-// to libomptarget.
-
-// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
-// there are no outstanding kernels that need to be synchronized. Any async call
-// may be passed a Queue==0, at which point the cuda implementation will set it
-// to non-null (see getStream). The cuda streams are per-device. Upstream may
-// change this interface to explicitly initialize the AsyncInfo_pointer, but
-// until then hsa lazily initializes it as well.
-
-void initAsyncInfo(__tgt_async_info *AsyncInfo) {
-  // set non-null while using async calls, return to null to indicate completion
-  assert(AsyncInfo);
-  if (!AsyncInfo->Queue) {
-    AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
-  }
-}
-void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
-  assert(AsyncInfo);
-  assert(AsyncInfo->Queue);
-  AsyncInfo->Queue = 0;
-}
-
-// Determine launch values for kernel.
-struct LaunchVals {
-  int WorkgroupSize;
-  int GridSize;
-};
-LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
-                         int ConstWGSize,
-                         llvm::omp::OMPTgtExecModeFlags ExecutionMode,
-                         int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
-                         int DeviceNumTeams) {
-
-  int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
-  int NumGroups = 0;
-
-  int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
-  if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
-    MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
-
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
-    DP("Max_Teams: %d\n", MaxTeams);
-    DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
-    DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
-    DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
-       RTLDeviceInfoTy::DefaultWgSize);
-    DP("thread_limit: %d\n", ThreadLimit);
-    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
-    DP("ConstWGSize: %d\n", ConstWGSize);
-  }
-  // check for thread_limit() clause
-  if (ThreadLimit > 0) {
-    ThreadsPerGroup = ThreadLimit;
-    DP("Setting threads per block to requested %d\n", ThreadLimit);
-    // Add master warp for GENERIC
-    if (ExecutionMode ==
-        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
-      ThreadsPerGroup += WarpSize;
-      DP("Adding master wavefront: +%d threads\n", WarpSize);
-    }
-    if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
-      ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
-      DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
-    }
-  }
-  // check flat_max_work_group_size attr here
-  if (ThreadsPerGroup > ConstWGSize) {
-    ThreadsPerGroup = ConstWGSize;
-    DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
-       ThreadsPerGroup);
-  }
-  if (print_kernel_trace & STARTUP_DETAILS)
-    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
-  DP("Preparing %d threads\n", ThreadsPerGroup);
-
-  // Set default num_groups (teams)
-  if (Env.TeamLimit > 0)
-    NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
-  else
-    NumGroups = MaxTeams;
-  DP("Set default num of groups %d\n", NumGroups);
-
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("num_groups: %d\n", NumGroups);
-    DP("num_teams: %d\n", NumTeams);
-  }
-
-  // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
-  // This reduction is typical for default case (no thread_limit clause).
-  // or when user goes crazy with num_teams clause.
-  // FIXME: We cant distinguish between a constant or variable thread limit.
-  // So we only handle constant thread_limits.
-  if (ThreadsPerGroup >
-      RTLDeviceInfoTy::DefaultWgSize) //  256 < threadsPerGroup <= 1024
-    // Should we round threadsPerGroup up to nearest WarpSize
-    // here?
-    NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
-
-  // check for num_teams() clause
-  if (NumTeams > 0) {
-    NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
-  }
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("num_groups: %d\n", NumGroups);
-    DP("Env.NumTeams %d\n", Env.NumTeams);
-    DP("Env.TeamLimit %d\n", Env.TeamLimit);
-  }
-
-  if (Env.NumTeams > 0) {
-    NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
-    DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
-  } else if (Env.TeamLimit > 0) {
-    NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
-    DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit);
-  } else {
-    if (NumTeams <= 0) {
-      if (LoopTripcount > 0) {
-        if (ExecutionMode ==
-            llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
-          // round up to the nearest integer
-          NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
-        } else if (ExecutionMode ==
-                   llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
-          NumGroups = LoopTripcount;
-        } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
-          // This is a generic kernel that was transformed to use SPMD-mode
-          // execution but uses Generic-mode semantics for scheduling.
-          NumGroups = LoopTripcount;
-        }
-        DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
-           "threads per block %d\n",
-           NumGroups, LoopTripcount, ThreadsPerGroup);
-      }
-    } else {
-      NumGroups = NumTeams;
-    }
-    if (NumGroups > MaxTeams) {
-      NumGroups = MaxTeams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
-    }
-    if (NumGroups > NumTeams && NumTeams > 0) {
-      NumGroups = NumTeams;
-      if (print_kernel_trace & STARTUP_DETAILS)
-        DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
-           NumTeams);
-    }
-  }
-
-  // num_teams clause always honored, no matter what, unless DEFAULT is active.
-  if (NumTeams > 0) {
-    NumGroups = NumTeams;
-    // Cap num_groups to EnvMaxTeamsDefault if set.
-    if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
-      NumGroups = Env.MaxTeamsDefault;
-  }
-  if (print_kernel_trace & STARTUP_DETAILS) {
-    DP("threadsPerGroup: %d\n", ThreadsPerGroup);
-    DP("num_groups: %d\n", NumGroups);
-    DP("loop_tripcount: %ld\n", LoopTripcount);
-  }
-  DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,
-     ThreadsPerGroup);
-
-  LaunchVals Res;
-  Res.WorkgroupSize = ThreadsPerGroup;
-  Res.GridSize = ThreadsPerGroup * NumGroups;
-  return Res;
-}
-
-static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
-  uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
-  bool Full = true;
-  while (Full) {
-    Full =
-        PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
-  }
-  return PacketId;
-}
-
-int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
-                        ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
-                        int32_t ThreadLimit, uint64_t LoopTripcount) {
-  // Set the context we are using
-  // update thread limit content in gpu memory if un-initialized or specified
-  // from host
-
-  DP("Run target team region thread_limit %d\n", ThreadLimit);
-
-  // All args are references.
-  std::vector<void *> Args(ArgNum);
-  std::vector<void *> Ptrs(ArgNum);
-
-  DP("Arg_num: %d\n", ArgNum);
-  for (int32_t I = 0; I < ArgNum; ++I) {
-    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-    Args[I] = &Ptrs[I];
-    DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I]));
-  }
-
-  KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
-
-  std::string KernelName = std::string(KernelInfo->Name);
-  auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
-  if (KernelInfoTable[DeviceId].find(KernelName) ==
-      KernelInfoTable[DeviceId].end()) {
-    DP("Kernel %s not found\n", KernelName.c_str());
-    return OFFLOAD_FAIL;
-  }
-
-  const atl_kernel_info_t KernelInfoEntry =
-      KernelInfoTable[DeviceId][KernelName];
-  const uint32_t GroupSegmentSize =
-      KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
-  const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
-  const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
-  const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
-  const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
-
-  assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
-
-  /*
-   * Set limit based on ThreadsPerGroup and GroupsPerDevice
-   */
-  LaunchVals LV =
-      getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
-                    KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
-                    NumTeams,      // From run_region arg
-                    ThreadLimit,   // From run_region arg
-                    LoopTripcount, // From run_region arg
-                    DeviceInfo().NumTeams[KernelInfo->DeviceId]);
-  const int GridSize = LV.GridSize;
-  const int WorkgroupSize = LV.WorkgroupSize;
-
-  if (print_kernel_trace >= LAUNCH) {
-    int NumGroups = GridSize / WorkgroupSize;
-    // enum modes are SPMD, GENERIC, NONE 0,1,2
-    // if doing rtl timing, print to stderr, unless stdout requested.
-    bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
-    fprintf(TraceToStdout ? stdout : stderr,
-            "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
-            "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
-            "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
-            DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
-            ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
-            GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
-            VgprSpillCount, LoopTripcount, KernelInfo->Name);
-  }
-
-  // Run on the device.
-  {
-    hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
-    if (!Queue) {
-      return OFFLOAD_FAIL;
-    }
-    uint64_t PacketId = acquireAvailablePacketId(Queue);
-
-    const uint32_t Mask = Queue->size - 1; // size is a power of 2
-    hsa_kernel_dispatch_packet_t *Packet =
-        (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
-
-    // packet->header is written last
-    Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
-    Packet->workgroup_size_x = WorkgroupSize;
-    Packet->workgroup_size_y = 1;
-    Packet->workgroup_size_z = 1;
-    Packet->reserved0 = 0;
-    Packet->grid_size_x = GridSize;
-    Packet->grid_size_y = 1;
-    Packet->grid_size_z = 1;
-    Packet->private_segment_size = KernelInfoEntry.private_segment_size;
-    Packet->group_segment_size = GroupSegmentSize;
-    Packet->kernel_object = KernelInfoEntry.kernel_object;
-    Packet->kernarg_address = 0;     // use the block allocator
-    Packet->reserved2 = 0;           // impl writes id_ here
-    Packet->completion_signal = {0}; // may want a pool of signals
-
-    KernelArgPool *ArgPool = nullptr;
-    void *KernArg = nullptr;
-    {
-      auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
-      if (It != KernelArgPoolMap.end()) {
-        ArgPool = (It->second).get();
-      }
-    }
-    if (!ArgPool) {
-      DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,
-         DeviceId);
-    }
-    {
-      if (ArgPool) {
-        assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
-        KernArg = ArgPool->allocate(ArgNum);
-      }
-      if (!KernArg) {
-        DP("Allocate kernarg failed\n");
-        return OFFLOAD_FAIL;
-      }
-
-      // Copy explicit arguments
-      for (int I = 0; I < ArgNum; I++) {
-        memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
-      }
-
-      // Initialize implicit arguments. TODO: Which of these can be dropped
-      AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<AMDGPUImplicitArgsTy *>(
-          static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
-      memset(ImplArgs, 0,
-             sizeof(AMDGPUImplicitArgsTy)); // may not be necessary
-      ImplArgs->OffsetX = 0;
-      ImplArgs->OffsetY = 0;
-      ImplArgs->OffsetZ = 0;
-
-      // assign a hostcall buffer for the selected Q
-      if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
-        // hostrpc_assign_buffer is not thread safe, and this function is
-        // under a multiple reader lock, not a writer lock.
-        static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
-        pthread_mutex_lock(&HostcallInitLock);
-        uint64_t Buffer = hostrpc_assign_buffer(
-            DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
-        pthread_mutex_unlock(&HostcallInitLock);
-        if (!Buffer) {
-          DP("hostrpc_assign_buffer failed, gpu would dereference null and "
-             "error\n");
-          return OFFLOAD_FAIL;
-        }
-
-        DP("Implicit argument count: %d\n",
-           KernelInfoEntry.implicit_argument_count);
-        if (KernelInfoEntry.implicit_argument_count >= 4) {
-          // Initialise pointer for implicit_argument_count != 0 ABI
-          // Guess that the right implicit argument is at offset 24 after
-          // the explicit arguments. In the future, should be able to read
-          // the offset from msgpack. Clang is not annotating it at present.
-          uint64_t Offset =
-              sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
-          if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
-            DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "
-               "args: %d\n",
-               Offset + 8, ArgPool->kernargSizeIncludingImplicit());
-          } else {
-            memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
-          }
-        }
-
-        // initialise pointer for implicit_argument_count == 0 ABI
-        ImplArgs->HostcallPtr = Buffer;
-      }
-
-      Packet->kernarg_address = KernArg;
-    }
-
-    hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
-    if (S.handle == 0) {
-      DP("Failed to get signal instance\n");
-      return OFFLOAD_FAIL;
-    }
-    Packet->completion_signal = S;
-    hsa_signal_store_relaxed(Packet->completion_signal, 1);
-
-    // Publish the packet indicating it is ready to be processed
-    core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
-                             core::createHeader(), Packet->setup);
-
-    // Since the packet is already published, its contents must not be
-    // accessed any more
-    hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
-
-    while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
-                                     HSA_WAIT_STATE_BLOCKED) != 0)
-      ;
-
-    assert(ArgPool);
-    ArgPool->deallocate(KernArg);
-    DeviceInfo().FreeSignalPool.push(S);
-  }
-
-  DP("Kernel completed\n");
-  return OFFLOAD_SUCCESS;
-}
-
-bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
-  const uint16_t AmdgcnMachineID = EM_AMDGPU;
-  const int32_t R = elf_check_machine(Image, AmdgcnMachineID);
-  if (!R) {
-    DP("Supported machine ID not found\n");
-  }
-  return R;
-}
-
-uint32_t elfEFlags(__tgt_device_image *Image) {
-  const char *ImgBegin = (char *)Image->ImageStart;
-  size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
-
-  StringRef Buffer = StringRef(ImgBegin, ImgSize);
-  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
-                                                  /*InitContent=*/false);
-  if (!ElfOrErr) {
-    consumeError(ElfOrErr.takeError());
-    return 0;
-  }
-
-  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
-    return ELFObj->getPlatformFlags();
-  return 0;
-}
-
-template <typename T> bool enforceUpperBound(T *Value, T Upper) {
-  bool Changed = *Value > Upper;
-  if (Changed) {
-    *Value = Upper;
-  }
-  return Changed;
-}
-
-struct SymbolInfo {
-  const void *Addr = nullptr;
-  uint32_t Size = UINT32_MAX;
-  uint32_t ShType = SHT_NULL;
-};
-
-int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj,
-                                StringRef SymName, SymbolInfo *Res) {
-  auto SymOrErr = getELFSymbol(ELFObj, SymName);
-  if (!SymOrErr) {
-    std::string ErrorString = toString(SymOrErr.takeError());
-    DP("Failed ELF lookup: %s\n", ErrorString.c_str());
-    return 1;
-  }
-  if (!*SymOrErr)
-    return 1;
-
-  auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
-  if (!SymSecOrErr) {
-    std::string ErrorString = toString(SymOrErr.takeError());
-    DP("Failed ELF lookup: %s\n", ErrorString.c_str());
-    return 1;
-  }
-
-  Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
-  Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size);
-  Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type);
-  return 0;
-}
-
-int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
-                                SymbolInfo *Res) {
-  StringRef Buffer = StringRef(Base, ImgSize);
-  auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
-                                                  /*InitContent=*/false);
-  if (!ElfOrErr) {
-    REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str());
-    return 1;
-  }
-
-  if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
-    return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
-  return 1;
-}
-
-hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
-                                  const char *SymName, const void **VarAddr,
-                                  uint32_t *VarSize) {
-  SymbolInfo SI;
-  int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
-  if (Rc == 0) {
-    *VarAddr = SI.Addr;
-    *VarSize = SI.Size;
-    return HSA_STATUS_SUCCESS;
-  }
-  return HSA_STATUS_ERROR;
-}
-
-template <typename C>
-hsa_status_t moduleRegisterFromMemoryToPlace(
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
-    std::vector<hsa_executable_t> &HSAExecutables) {
-  auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
-    C *Unwrapped = static_cast<C *>(CbState);
-    return (*Unwrapped)(Data, Size);
-  };
-  return core::RegisterModuleFromMemory(
-      KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
-      DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
-      HSAExecutables);
-}
-
-uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
-  uint64_t DeviceStateBytes = 0;
-  {
-    // If this is the deviceRTL, get the state variable size
-    SymbolInfo SizeSi;
-    int Rc = getSymbolInfoWithoutLoading(
-        ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
-
-    if (Rc == 0) {
-      if (SizeSi.Size != sizeof(uint64_t)) {
-        DP("Found device_State_size variable with wrong size\n");
-        return 0;
-      }
-
-      // Read number of bytes directly from the elf
-      memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
-    }
-  }
-  return DeviceStateBytes;
-}
-
-struct DeviceEnvironment {
-  // initialise an DeviceEnvironmentTy in the deviceRTL
-  // patches around differences in the deviceRTL between trunk, aomp,
-  // rocmcc. Over time these differences will tend to zero and this class
-  // simplified.
-  // Symbol may be in .data or .bss, and may be missing fields, todo:
-  // review aomp/trunk/rocm and simplify the following
-
-  // The symbol may also have been deadstripped because the device side
-  // accessors were unused.
-
-  // If the symbol is in .data (aomp, rocm) it can be written directly.
-  // If it is in .bss, we must wait for it to be allocated space on the
-  // gpu (trunk) and initialize after loading.
-  const char *sym() { return "__omp_rtl_device_environment"; }
-
-  DeviceEnvironmentTy HostDeviceEnv;
-  SymbolInfo SI;
-  bool Valid = false;
-
-  __tgt_device_image *Image;
-  const size_t ImgSize;
-
-  DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
-                    __tgt_device_image *Image, const size_t ImgSize)
-      : Image(Image), ImgSize(ImgSize) {
-
-    HostDeviceEnv.NumDevices = NumberDevices;
-    HostDeviceEnv.DeviceNum = DeviceId;
-    HostDeviceEnv.DebugKind = 0;
-    HostDeviceEnv.DynamicMemSize = DynamicMemSize;
-    if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
-      HostDeviceEnv.DebugKind = std::stoi(EnvStr);
-
-    int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
-                                         sym(), &SI);
-    if (Rc != 0) {
-      DP("Finding global device environment '%s' - symbol missing.\n", sym());
-      return;
-    }
-
-    if (SI.Size > sizeof(HostDeviceEnv)) {
-      DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,
-         sizeof(HostDeviceEnv));
-      return;
-    }
-
-    Valid = true;
-  }
-
-  bool inImage() { return SI.ShType != SHT_NOBITS; }
-
-  hsa_status_t beforeLoading(void *Data, size_t Size) {
-    if (Valid) {
-      if (inImage()) {
-        DP("Setting global device environment before load (%u bytes)\n",
-           SI.Size);
-        uint64_t Offset = reinterpret_cast<const char *>(SI.Addr) -
-                          reinterpret_cast<const char *>(Image->ImageStart);
-        void *Pos = reinterpret_cast<char *>(Data) + Offset;
-        memcpy(Pos, &HostDeviceEnv, SI.Size);
-      }
-    }
-    return HSA_STATUS_SUCCESS;
-  }
-
-  hsa_status_t afterLoading() {
-    if (Valid) {
-      if (!inImage()) {
-        DP("Setting global device environment after load (%u bytes)\n",
-           SI.Size);
-        int DeviceId = HostDeviceEnv.DeviceNum;
-        auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
-        void *StatePtr;
-        uint32_t StatePtrSize;
-        hsa_status_t Err = interop_hsa_get_symbol_info(
-            SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
-        if (Err != HSA_STATUS_SUCCESS) {
-          DP("failed to find %s in loaded image\n", sym());
-          return Err;
-        }
-
-        if (StatePtrSize != SI.Size) {
-          DP("Symbol had size %u before loading, %u after\n", StatePtrSize,
-             SI.Size);
-          return HSA_STATUS_ERROR;
-        }
-
-        return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
-                                                    StatePtrSize, DeviceId);
-      }
-    }
-    return HSA_STATUS_SUCCESS;
-  }
-};
-
-hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
-  uint64_t Rounded = 4 * ((Size + 3) / 4);
-  void *Ptr;
-  hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
-  hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
-  if (Err != HSA_STATUS_SUCCESS) {
-    return Err;
-  }
-
-  hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
-  if (Rc != HSA_STATUS_SUCCESS) {
-    DP("zero fill device_state failed with %u\n", Rc);
-    core::Runtime::Memfree(Ptr);
-    return HSA_STATUS_ERROR;
-  }
-
-  *RetPtr = Ptr;
-  return HSA_STATUS_SUCCESS;
-}
-
-bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
-  SymbolInfo SI;
-  int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
-  return (Rc == 0) && (SI.Addr != nullptr);
-}
-
-hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent,
-                         void **LockedHostPtr) {
-  hsa_status_t err = is_locked(HostPtr, LockedHostPtr);
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-
-  // HostPtr is already locked, just return it
-  if (*LockedHostPtr)
-    return HSA_STATUS_SUCCESS;
-
-  hsa_agent_t Agents[1] = {Agent};
-  return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1,
-                             LockedHostPtr);
-}
-
-hsa_status_t unlock_memory(void *HostPtr) {
-  void *LockedHostPtr = nullptr;
-  hsa_status_t err = is_locked(HostPtr, &LockedHostPtr);
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-
-  // if LockedHostPtr is nullptr, then HostPtr was not locked
-  if (!LockedHostPtr)
-    return HSA_STATUS_SUCCESS;
-
-  err = hsa_amd_memory_unlock(HostPtr);
-  return err;
-}
-
-} // namespace
-
-namespace core {
-hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
-  return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
-                                     &DeviceInfo().HSAAgents[0], NULL, Ptr);
-}
-} // namespace core
-
-static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
-  hsa_status_t err;
-  uint32_t name_len;
-  err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error getting ISA info length\n");
-    return err;
-  }
-
-  char TargetID[name_len];
-  err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error getting ISA info name\n");
-    return err;
-  }
-
-  auto TripleTargetID = llvm::StringRef(TargetID);
-  if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
-    DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
-  }
-  return HSA_STATUS_SUCCESS;
-}
-
-extern "C" {
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-  return elfMachineIdIsAmdgcn(Image);
-}
-
-int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
-                                       __tgt_image_info *info) {
-  if (!__tgt_rtl_is_valid_binary(image))
-    return false;
-
-  // A subarchitecture was not specified. Assume it is compatible.
-  if (!info->Arch)
-    return true;
-
-  int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
-
-  for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
-    __tgt_rtl_init_device(DeviceId);
-    hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
-    hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
-    if (err != HSA_STATUS_SUCCESS) {
-      DP("Error iterating ISAs\n");
-      return false;
-    }
-    if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId]))
-      return false;
-  }
-  DP("Image has Target ID compatible with the current environment: %s\n",
-     info->Arch);
-  return true;
-}
-
-int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
-int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
-
-int __tgt_rtl_number_of_devices() {
-  // If the construction failed, no methods are safe to call
-  if (DeviceInfo().ConstructionSucceeded) {
-    return DeviceInfo().NumberOfDevices;
-  }
-  DP("AMDGPU plugin construction failed. Zero devices available\n");
-  return 0;
-}
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  DP("Init requires flags to %ld\n", RequiresFlags);
-  DeviceInfo().RequiresFlags = RequiresFlags;
-  return RequiresFlags;
-}
-
-int32_t __tgt_rtl_init_device(int DeviceId) {
-  hsa_status_t Err = hsa_init();
-  if (Err != HSA_STATUS_SUCCESS) {
-    DP("HSA Initialization Failed.\n");
-    return HSA_STATUS_ERROR;
-  }
-  // this is per device id init
-  DP("Initialize the device id: %d\n", DeviceId);
-
-  hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
-
-  // Get number of Compute Unit
-  uint32_t ComputeUnits = 0;
-  Err = hsa_agent_get_info(
-      Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
-      &ComputeUnits);
-  if (Err != HSA_STATUS_SUCCESS) {
-    DeviceInfo().ComputeUnits[DeviceId] = 1;
-    DP("Error getting compute units : settiing to 1\n");
-  } else {
-    DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
-    DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]);
-  }
-
-  char GetInfoName[64]; // 64 max size returned by get info
-  Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
-                           (void *)GetInfoName);
-  if (Err)
-    DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
-  else {
-    DeviceInfo().GPUName[DeviceId] = GetInfoName;
-  }
-
-  if (print_kernel_trace & STARTUP_DETAILS)
-    DP("Device#%-2d CU's: %2d %s\n", DeviceId,
-       DeviceInfo().ComputeUnits[DeviceId],
-       DeviceInfo().GPUName[DeviceId].c_str());
-
-  // Query attributes to determine number of threads/block and blocks/grid.
-  uint16_t WorkgroupMaxDim[3];
-  Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
-                           &WorkgroupMaxDim);
-  if (Err != HSA_STATUS_SUCCESS) {
-    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
-    DP("Error getting grid dims: num groups : %d\n",
-       RTLDeviceInfoTy::DefaultNumTeams);
-  } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
-    DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
-    DP("Using %d ROCm blocks per grid\n",
-       DeviceInfo().GroupsPerDevice[DeviceId]);
-  } else {
-    DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
-    DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "
-       "at the hard limit\n",
-       WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit);
-  }
-
-  // Get thread limit
-  hsa_dim3_t GridMaxDim;
-  Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
-  if (Err == HSA_STATUS_SUCCESS) {
-    DeviceInfo().ThreadsPerGroup[DeviceId] =
-        reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
-        DeviceInfo().GroupsPerDevice[DeviceId];
-
-    if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
-      DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
-      DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
-    } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
-                                 RTLDeviceInfoTy::MaxWgSize)) {
-      DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize);
-    } else {
-      DP("Using ROCm Queried thread limit: %d\n",
-         DeviceInfo().ThreadsPerGroup[DeviceId]);
-    }
-  } else {
-    DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
-    DP("Error getting max block dimension, use default:%d \n",
-       RTLDeviceInfoTy::MaxWgSize);
-  }
-
-  // Get wavefront size
-  uint32_t WavefrontSize = 0;
-  Err =
-      hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
-  if (Err == HSA_STATUS_SUCCESS) {
-    DP("Queried wavefront size: %d\n", WavefrontSize);
-    DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
-  } else {
-    // TODO: Burn the wavefront size into the code object
-    DP("Warning: Unknown wavefront size, assuming 64\n");
-    DeviceInfo().WarpSize[DeviceId] = 64;
-  }
-
-  // Adjust teams to the env variables
-
-  if (DeviceInfo().Env.TeamLimit > 0 &&
-      (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
-                         DeviceInfo().Env.TeamLimit))) {
-    DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",
-       DeviceInfo().Env.TeamLimit);
-  }
-
-  // Set default number of teams
-  if (DeviceInfo().Env.NumTeams > 0) {
-    DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
-    DP("Default number of teams set according to environment %d\n",
-       DeviceInfo().Env.NumTeams);
-  } else {
-    char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
-    int TeamsPerCU = DefaultTeamsPerCU;
-    if (TeamsPerCUEnvStr) {
-      TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
-    }
-
-    DeviceInfo().NumTeams[DeviceId] =
-        TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
-    DP("Default number of teams = %d * number of compute units %d\n",
-       TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]);
-  }
-
-  if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
-                        DeviceInfo().GroupsPerDevice[DeviceId])) {
-    DP("Default number of teams exceeds device limit, capping at %d\n",
-       DeviceInfo().GroupsPerDevice[DeviceId]);
-  }
-
-  // Adjust threads to the env variables
-  if (DeviceInfo().Env.TeamThreadLimit > 0 &&
-      (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
-                         DeviceInfo().Env.TeamThreadLimit))) {
-    DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
-       DeviceInfo().Env.TeamThreadLimit);
-  }
-
-  // Set default number of threads
-  DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
-  DP("Default number of threads set according to library's default %d\n",
-     RTLDeviceInfoTy::DefaultWgSize);
-  if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
-                        DeviceInfo().ThreadsPerGroup[DeviceId])) {
-    DP("Default number of threads exceeds device limit, capping at %d\n",
-       DeviceInfo().ThreadsPerGroup[DeviceId]);
-  }
-
-  DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",
-     DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],
-     DeviceInfo().ThreadsPerGroup[DeviceId]);
-
-  DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,
-     DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],
-     DeviceInfo().GroupsPerDevice[DeviceId],
-     DeviceInfo().GroupsPerDevice[DeviceId] *
-         DeviceInfo().ThreadsPerGroup[DeviceId]);
-
-  return OFFLOAD_SUCCESS;
-}
-
-static __tgt_target_table *
-__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-  DeviceInfo().LoadRunLock.lock();
-  __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
-  DeviceInfo().LoadRunLock.unlock();
-  return Res;
-}
-
-__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
-                                                 __tgt_device_image *Image) {
-  // This function loads the device image onto gpu[DeviceId] and does other
-  // per-image initialization work. Specifically:
-  //
-  // - Initialize an DeviceEnvironmentTy instance embedded in the
-  //   image at the symbol "__omp_rtl_device_environment"
-  //   Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
-  //
-  // - Allocate a large array per-gpu (could be moved to init_device)
-  //   - Read a uint64_t at symbol omptarget_nvptx_device_State_size
-  //   - Allocate at least that many bytes of gpu memory
-  //   - Zero initialize it
-  //   - Write the pointer to the symbol omptarget_nvptx_device_State
-  //
-  // - Pulls some per-kernel information together from various sources and
-  //   records it in the KernelsList for quicker access later
-  //
-  // The initialization can be done before or after loading the image onto the
-  // gpu. This function presently does a mixture. Using the hsa api to get/set
-  // the information is simpler to implement, in exchange for more complicated
-  // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
-  // back from the gpu vs a hashtable lookup on the host.
-
-  const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
-
-  DeviceInfo().clearOffloadEntriesTable(DeviceId);
-
-  // We do not need to set the ELF version because the caller of this function
-  // had to do that to decide the right runtime to use
-
-  if (!elfMachineIdIsAmdgcn(Image))
-    return NULL;
-
-  {
-    auto Env =
-        DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
-                          DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
-
-    auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
-    auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
-    hsa_status_t Err = moduleRegisterFromMemoryToPlace(
-        KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
-        [&](void *Data, size_t Size) {
-          if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
-            __atomic_store_n(&DeviceInfo().HostcallRequired, true,
-                             __ATOMIC_RELEASE);
-          }
-          return Env.beforeLoading(Data, Size);
-        },
-        DeviceInfo().HSAExecutables);
-
-    check("Module registering", Err);
-    if (Err != HSA_STATUS_SUCCESS) {
-      const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
-      const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
-
-      if (strcmp(DeviceName, ElfName) != 0) {
-        DP("Possible gpu arch mismatch: device:%s, image:%s please check"
-           " compiler flag: -march=<gpu>\n",
-           DeviceName, ElfName);
-      } else {
-        DP("Error loading image onto GPU: %s\n", get_error_string(Err));
-      }
-
-      return NULL;
-    }
-
-    Err = Env.afterLoading();
-    if (Err != HSA_STATUS_SUCCESS) {
-      return NULL;
-    }
-  }
-
-  DP("AMDGPU module successfully loaded!\n");
-
-  {
-    // the device_State array is either large value in bss or a void* that
-    // needs to be assigned to a pointer to an array of size device_state_bytes
-    // If absent, it has been deadstripped and needs no setup.
-
-    void *StatePtr;
-    uint32_t StatePtrSize;
-    auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
-    hsa_status_t Err = interop_hsa_get_symbol_info(
-        SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
-        &StatePtrSize);
-
-    if (Err != HSA_STATUS_SUCCESS) {
-      DP("No device_state symbol found, skipping initialization\n");
-    } else {
-      if (StatePtrSize < sizeof(void *)) {
-        DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,
-           sizeof(void *));
-        return NULL;
-      }
-
-      // if it's larger than a void*, assume it's a bss array and no further
-      // initialization is required. Only try to set up a pointer for
-      // sizeof(void*)
-      if (StatePtrSize == sizeof(void *)) {
-        uint64_t DeviceStateBytes =
-            getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
-        if (DeviceStateBytes == 0) {
-          DP("Can't initialize device_State, missing size information\n");
-          return NULL;
-        }
-
-        auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
-        if (DSS.first.get() == nullptr) {
-          assert(DSS.second == 0);
-          void *Ptr = NULL;
-          hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
-          if (Err != HSA_STATUS_SUCCESS) {
-            DP("Failed to allocate device_state array\n");
-            return NULL;
-          }
-          DSS = {
-              std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
-              DeviceStateBytes,
-          };
-        }
-
-        void *Ptr = DSS.first.get();
-        if (DeviceStateBytes != DSS.second) {
-          DP("Inconsistent sizes of device_State unsupported\n");
-          return NULL;
-        }
-
-        // write ptr to device memory so it can be used by later kernels
-        Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
-                                                   sizeof(void *), DeviceId);
-        if (Err != HSA_STATUS_SUCCESS) {
-          DP("memcpy install of state_ptr failed\n");
-          return NULL;
-        }
-      }
-    }
-  }
-
-  // Here, we take advantage of the data that is appended after img_end to get
-  // the symbols' name we need to load. This data consist of the host entries
-  // begin and end as well as the target name (see the offloading linker script
-  // creation in clang compiler).
-
-  // Find the symbols in the module by name. The name can be obtain by
-  // concatenating the host entry name with the target name
-
-  __tgt_offload_entry *HostBegin = Image->EntriesBegin;
-  __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
-  for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
-
-    if (!E->addr) {
-      // The host should have always something in the address to
-      // uniquely identify the target region.
-      DP("Analyzing host entry '<null>' (size = %lld)...\n",
-         (unsigned long long)E->size);
-      return NULL;
-    }
-
-    if (E->size) {
-      __tgt_offload_entry Entry = *E;
-
-      void *Varptr;
-      uint32_t Varsize;
-
-      auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
-      hsa_status_t Err = interop_hsa_get_symbol_info(
-          SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
-
-      if (Err != HSA_STATUS_SUCCESS) {
-        // Inform the user what symbol prevented offloading
-        DP("Loading global '%s' (Failed)\n", E->name);
-        return NULL;
-      }
-
-      if (Varsize != E->size) {
-        DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,
-           Varsize, E->size);
-        return NULL;
-      }
-
-      DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-         DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr));
-      Entry.addr = (void *)Varptr;
-
-      DeviceInfo().addOffloadEntry(DeviceId, Entry);
-
-      if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
-          E->flags & OMP_DECLARE_TARGET_LINK) {
-        // If unified memory is present any target link variables
-        // can access host addresses directly. There is no longer a
-        // need for device copies.
-        Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
-                                                   sizeof(void *), DeviceId);
-        if (Err != HSA_STATUS_SUCCESS)
-          DP("Error when copying USM\n");
-        DP("Copy linked variable host address (" DPxMOD ")"
-           "to device address (" DPxMOD ")\n",
-           DPxPTR(*((void **)E->addr)), DPxPTR(Varptr));
-      }
-
-      continue;
-    }
-
-    DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name));
-
-    // errors in kernarg_segment_size previously treated as = 0 (or as undef)
-    uint32_t KernargSegmentSize = 0;
-    auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
-    hsa_status_t Err = HSA_STATUS_SUCCESS;
-    if (!E->name) {
-      Err = HSA_STATUS_ERROR;
-    } else {
-      std::string KernelStr = std::string(E->name);
-      auto It = KernelInfoMap.find(KernelStr);
-      if (It != KernelInfoMap.end()) {
-        atl_kernel_info_t Info = It->second;
-        KernargSegmentSize = Info.kernel_segment_size;
-      } else {
-        Err = HSA_STATUS_ERROR;
-      }
-    }
-
-    // default value GENERIC (in case symbol is missing from cubin file)
-    llvm::omp::OMPTgtExecModeFlags ExecModeVal =
-        llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
-
-    // get flat group size if present, else Default_WG_Size
-    int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
-
-    // get Kernel Descriptor if present.
-    // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
-    struct KernDescValType {
-      uint16_t Version;
-      uint16_t TSize;
-      uint16_t WGSize;
-    };
-    struct KernDescValType KernDescVal;
-    std::string KernDescNameStr(E->name);
-    KernDescNameStr += "_kern_desc";
-    const char *KernDescName = KernDescNameStr.c_str();
-
-    const void *KernDescPtr;
-    uint32_t KernDescSize;
-    void *CallStackAddr = nullptr;
-    Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
-                               &KernDescPtr, &KernDescSize);
-
-    if (Err == HSA_STATUS_SUCCESS) {
-      if ((size_t)KernDescSize != sizeof(KernDescVal))
-        DP("Loading global computation properties '%s' - size mismatch (%u != "
-           "%lu)\n",
-           KernDescName, KernDescSize, sizeof(KernDescVal));
-
-      memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
-
-      // Check structure size against recorded size.
-      if ((size_t)KernDescSize != KernDescVal.TSize)
-        DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",
-           sizeof(KernDescVal), KernDescVal.TSize, KernDescName);
-
-      DP("After loading global for %s KernDesc \n", KernDescName);
-      DP("KernDesc: Version: %d\n", KernDescVal.Version);
-      DP("KernDesc: TSize: %d\n", KernDescVal.TSize);
-      DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize);
-
-      if (KernDescVal.WGSize == 0) {
-        KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
-        DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize);
-      }
-      WGSizeVal = KernDescVal.WGSize;
-      DP("WGSizeVal %d\n", WGSizeVal);
-      check("Loading KernDesc computation property", Err);
-    } else {
-      DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName);
-
-      // Flat group size
-      std::string WGSizeNameStr(E->name);
-      WGSizeNameStr += "_wg_size";
-      const char *WGSizeName = WGSizeNameStr.c_str();
-
-      const void *WGSizePtr;
-      uint32_t WGSize;
-      Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
-                                 &WGSizePtr, &WGSize);
-
-      if (Err == HSA_STATUS_SUCCESS) {
-        if ((size_t)WGSize != sizeof(int16_t)) {
-          DP("Loading global computation properties '%s' - size mismatch (%u "
-             "!= "
-             "%lu)\n",
-             WGSizeName, WGSize, sizeof(int16_t));
-          return NULL;
-        }
-
-        memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
-
-        DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal);
-
-        if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
-            WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
-          DP("Error wrong WGSize value specified in HSA code object file: "
-             "%d\n",
-             WGSizeVal);
-          WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
-        }
-      } else {
-        DP("Warning: Loading WGSize '%s' - symbol not found, "
-           "using default value %d\n",
-           WGSizeName, WGSizeVal);
-      }
-
-      check("Loading WGSize computation property", Err);
-    }
-
-    // Read execution mode from global in binary
-    std::string ExecModeNameStr(E->name);
-    ExecModeNameStr += "_exec_mode";
-    const char *ExecModeName = ExecModeNameStr.c_str();
-
-    const void *ExecModePtr;
-    uint32_t VarSize;
-    Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
-                               &ExecModePtr, &VarSize);
-
-    if (Err == HSA_STATUS_SUCCESS) {
-      if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
-        DP("Loading global computation properties '%s' - size mismatch(%u != "
-           "%lu)\n",
-           ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
-        return NULL;
-      }
-
-      memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
-
-      DP("After loading global for %s ExecMode = %d\n", ExecModeName,
-         ExecModeVal);
-
-      if (ExecModeVal < 0 ||
-          ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
-        DP("Error wrong exec_mode value specified in HSA code object file: "
-           "%d\n",
-           ExecModeVal);
-        return NULL;
-      }
-    } else {
-      DP("Loading global exec_mode '%s' - symbol missing, using default "
-         "value "
-         "GENERIC (1)\n",
-         ExecModeName);
-    }
-    check("Loading computation property", Err);
-
-    KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
-                                   CallStackAddr, E->name, KernargSegmentSize,
-                                   DeviceInfo().KernArgPool));
-    __tgt_offload_entry Entry = *E;
-    Entry.addr = (void *)&KernelsList.back();
-    DeviceInfo().addOffloadEntry(DeviceId, Entry);
-    DP("Entry point %ld maps to %s\n", E - HostBegin, E->name);
-  }
-
-  return DeviceInfo().getOffloadEntriesTable(DeviceId);
-}
-
-void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
-  void *Ptr = NULL;
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-
-  hsa_amd_memory_pool_t MemoryPool;
-  switch (Kind) {
-  case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE:
-    // GPU memory
-    MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
-    break;
-  case TARGET_ALLOC_HOST:
-    // non-migratable memory accessible by host and device(s)
-    MemoryPool = DeviceInfo().getHostMemoryPool();
-    break;
-  default:
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-    return NULL;
-  }
-
-  hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
-  DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,
-     (long long unsigned)(Elf64_Addr)Ptr);
-  Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
-  return Ptr;
-}
-
-int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  __tgt_async_info AsyncInfo;
-  int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
-                                    int64_t Size, __tgt_async_info *AsyncInfo) {
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  if (AsyncInfo) {
-    initAsyncInfo(AsyncInfo);
-    return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
-  }
-  return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  __tgt_async_info AsyncInfo;
-  int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
-                                      int64_t Size,
-                                      __tgt_async_info *AsyncInfo) {
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  initAsyncInfo(AsyncInfo);
-  return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  // HSA can free pointers allocated from different types of memory pool.
-  hsa_status_t Err;
-  DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr);
-  Err = core::Runtime::Memfree(TgtPtr);
-  if (Err != HSA_STATUS_SUCCESS) {
-    DP("Error when freeing CUDA memory\n");
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfo) {
-  assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "Only one dimensional kernels supported.");
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-  initAsyncInfo(AsyncInfo);
-
-  DeviceInfo().LoadRunLock.lock_shared();
-  int32_t Res =
-      runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
-                      KernelArgs->NumArgs, KernelArgs->NumTeams[0],
-                      KernelArgs->ThreadLimit[0], KernelArgs->Tripcount);
-
-  DeviceInfo().LoadRunLock.unlock_shared();
-  return Res;
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-
-  // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
-  // is not ensured by devices.cpp for amdgcn
-  // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
-  if (AsyncInfo->Queue) {
-    finiAsyncInfo(AsyncInfo);
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  // TODO: Assertion to see if DeviceId is correct
-  // NOTE: We don't need to set context for print device info.
-
-  DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
-}
-
-int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size,
-                            void **LockedHostPtr) {
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-
-  hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
-  hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error in tgt_rtl_data_lock\n");
-    return OFFLOAD_FAIL;
-  }
-  DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,
-     (long long unsigned)(Elf64_Addr)*LockedHostPtr);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) {
-  assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
-  hsa_status_t err = unlock_memory(HostPtr);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error in tgt_rtl_data_unlock\n");
-    return OFFLOAD_FAIL;
-  }
-
-  DP("Tgt unlock data (tgt:%016llx).\n",
-     (long long unsigned)(Elf64_Addr)HostPtr);
-  return OFFLOAD_SUCCESS;
-}
-
-} // extern "C"
diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Common parts which can be used by all plugins
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(elf_common)
-add_subdirectory(MemoryManager)
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt b/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-
-add_library(MemoryManager INTERFACE)
-
-target_include_directories(MemoryManager INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
+++ /dev/null
@@ -1,347 +0,0 @@
-//===----------- MemoryManager.h - Target independent memory manager ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target independent memory manager.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H
-
-#include <cassert>
-#include <functional>
-#include <list>
-#include <mutex>
-#include <set>
-#include <unordered_map>
-#include <vector>
-
-#include "Debug.h"
-#include "omptargetplugin.h"
-
-/// Base class of per-device allocator.
-class DeviceAllocatorTy {
-public:
-  virtual ~DeviceAllocatorTy() = default;
-
-  /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
-  /// allocation.
-  virtual void *allocate(size_t Size, void *HstPtr,
-                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
-
-  /// Delete the pointer \p TgtPtr on the device
-  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
-};
-
-/// Class of memory manager. The memory manager is per-device by using
-/// per-device allocator. Therefore, each plugin using memory manager should
-/// have an allocator for each device.
-class MemoryManagerTy {
-  static constexpr const size_t BucketSize[] = {
-      0,       1U << 2, 1U << 3,  1U << 4,  1U << 5,  1U << 6, 1U << 7,
-      1U << 8, 1U << 9, 1U << 10, 1U << 11, 1U << 12, 1U << 13};
-
-  static constexpr const int NumBuckets =
-      sizeof(BucketSize) / sizeof(BucketSize[0]);
-
-  /// Find the previous number that is power of 2 given a number that is not
-  /// power of 2.
-  static size_t floorToPowerOfTwo(size_t Num) {
-    Num |= Num >> 1;
-    Num |= Num >> 2;
-    Num |= Num >> 4;
-    Num |= Num >> 8;
-    Num |= Num >> 16;
-#if INTPTR_MAX == INT64_MAX
-    Num |= Num >> 32;
-#elif INTPTR_MAX == INT32_MAX
-    // Do nothing with 32-bit
-#else
-#error Unsupported architecture
-#endif
-    Num += 1;
-    return Num >> 1;
-  }
-
-  /// Find a suitable bucket
-  static int findBucket(size_t Size) {
-    const size_t F = floorToPowerOfTwo(Size);
-
-    DP("findBucket: Size %zu is floored to %zu.\n", Size, F);
-
-    int L = 0, H = NumBuckets - 1;
-    while (H - L > 1) {
-      int M = (L + H) >> 1;
-      if (BucketSize[M] == F)
-        return M;
-      if (BucketSize[M] > F)
-        H = M - 1;
-      else
-        L = M;
-    }
-
-    assert(L >= 0 && L < NumBuckets && "L is out of range");
-
-    DP("findBucket: Size %zu goes to bucket %d\n", Size, L);
-
-    return L;
-  }
-
-  /// A structure stores the meta data of a target pointer
-  struct NodeTy {
-    /// Memory size
-    const size_t Size;
-    /// Target pointer
-    void *Ptr;
-
-    /// Constructor
-    NodeTy(size_t Size, void *Ptr) : Size(Size), Ptr(Ptr) {}
-  };
-
-  /// To make \p NodePtrTy ordered when they're put into \p std::multiset.
-  struct NodeCmpTy {
-    bool operator()(const NodeTy &LHS, const NodeTy &RHS) const {
-      return LHS.Size < RHS.Size;
-    }
-  };
-
-  /// A \p FreeList is a set of Nodes. We're using \p std::multiset here to make
-  /// the look up procedure more efficient.
-  using FreeListTy = std::multiset<std::reference_wrapper<NodeTy>, NodeCmpTy>;
-
-  /// A list of \p FreeListTy entries, each of which is a \p std::multiset of
-  /// Nodes whose size is less or equal to a specific bucket size.
-  std::vector<FreeListTy> FreeLists;
-  /// A list of mutex for each \p FreeListTy entry
-  std::vector<std::mutex> FreeListLocks;
-  /// A table to map from a target pointer to its node
-  std::unordered_map<void *, NodeTy> PtrToNodeTable;
-  /// The mutex for the table \p PtrToNodeTable
-  std::mutex MapTableLock;
-
-  /// The reference to a device allocator
-  DeviceAllocatorTy &DeviceAllocator;
-
-  /// The threshold to manage memory using memory manager. If the request size
-  /// is larger than \p SizeThreshold, the allocation will not be managed by the
-  /// memory manager.
-  size_t SizeThreshold = 1U << 13;
-
-  /// Request memory from target device
-  void *allocateOnDevice(size_t Size, void *HstPtr) const {
-    return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE);
-  }
-
-  /// Deallocate data on device
-  int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); }
-
-  /// This function is called when it tries to allocate memory on device but the
-  /// device returns out of memory. It will first free all memory in the
-  /// FreeList and try to allocate again.
-  void *freeAndAllocate(size_t Size, void *HstPtr) {
-    std::vector<void *> RemoveList;
-
-    // Deallocate all memory in FreeList
-    for (int I = 0; I < NumBuckets; ++I) {
-      FreeListTy &List = FreeLists[I];
-      std::lock_guard<std::mutex> Lock(FreeListLocks[I]);
-      if (List.empty())
-        continue;
-      for (const NodeTy &N : List) {
-        deleteOnDevice(N.Ptr);
-        RemoveList.push_back(N.Ptr);
-      }
-      FreeLists[I].clear();
-    }
-
-    // Remove all nodes in the map table which have been released
-    if (!RemoveList.empty()) {
-      std::lock_guard<std::mutex> LG(MapTableLock);
-      for (void *P : RemoveList)
-        PtrToNodeTable.erase(P);
-    }
-
-    // Try allocate memory again
-    return allocateOnDevice(Size, HstPtr);
-  }
-
-  /// The goal is to allocate memory on the device. It first tries to
-  /// allocate directly on the device. If a \p nullptr is returned, it might
-  /// be because the device is OOM. In that case, it will free all unused
-  /// memory and then try again.
-  void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) {
-    void *TgtPtr = allocateOnDevice(Size, HstPtr);
-    // We cannot get memory from the device. It might be due to OOM. Let's
-    // free all memory in FreeLists and try again.
-    if (TgtPtr == nullptr) {
-      DP("Failed to get memory on device. Free all memory in FreeLists and "
-         "try again.\n");
-      TgtPtr = freeAndAllocate(Size, HstPtr);
-    }
-
-    if (TgtPtr == nullptr)
-      DP("Still cannot get memory on device probably because the device is "
-         "OOM.\n");
-
-    return TgtPtr;
-  }
-
-public:
-  /// Constructor. If \p Threshold is non-zero, then the default threshold will
-  /// be overwritten by \p Threshold.
-  MemoryManagerTy(DeviceAllocatorTy &DeviceAllocator, size_t Threshold = 0)
-      : FreeLists(NumBuckets), FreeListLocks(NumBuckets),
-        DeviceAllocator(DeviceAllocator) {
-    if (Threshold)
-      SizeThreshold = Threshold;
-  }
-
-  /// Destructor
-  ~MemoryManagerTy() {
-    for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end();
-         ++Itr) {
-      assert(Itr->second.Ptr && "nullptr in map table");
-      deleteOnDevice(Itr->second.Ptr);
-    }
-  }
-
-  /// Allocate memory of size \p Size from target device. \p HstPtr is used to
-  /// assist the allocation.
-  void *allocate(size_t Size, void *HstPtr) {
-    // If the size is zero, we will not bother the target device. Just return
-    // nullptr directly.
-    if (Size == 0)
-      return nullptr;
-
-    DP("MemoryManagerTy::allocate: size %zu with host pointer " DPxMOD ".\n",
-       Size, DPxPTR(HstPtr));
-
-    // If the size is greater than the threshold, allocate it directly from
-    // device.
-    if (Size > SizeThreshold) {
-      DP("%zu is greater than the threshold %zu. Allocate it directly from "
-         "device\n",
-         Size, SizeThreshold);
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
-
-      DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr));
-
-      return TgtPtr;
-    }
-
-    NodeTy *NodePtr = nullptr;
-
-    // Try to get a node from FreeList
-    {
-      const int B = findBucket(Size);
-      FreeListTy &List = FreeLists[B];
-
-      NodeTy TempNode(Size, nullptr);
-      std::lock_guard<std::mutex> LG(FreeListLocks[B]);
-      const auto Itr = List.find(TempNode);
-
-      if (Itr != List.end()) {
-        NodePtr = &Itr->get();
-        List.erase(Itr);
-      }
-    }
-
-    if (NodePtr != nullptr)
-      DP("Find one node " DPxMOD " in the bucket.\n", DPxPTR(NodePtr));
-
-    // We cannot find a valid node in FreeLists. Let's allocate on device and
-    // create a node for it.
-    if (NodePtr == nullptr) {
-      DP("Cannot find a node in the FreeLists. Allocate on device.\n");
-      // Allocate one on device
-      void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr);
-
-      if (TgtPtr == nullptr)
-        return nullptr;
-
-      // Create a new node and add it into the map table
-      {
-        std::lock_guard<std::mutex> Guard(MapTableLock);
-        auto Itr = PtrToNodeTable.emplace(TgtPtr, NodeTy(Size, TgtPtr));
-        NodePtr = &Itr.first->second;
-      }
-
-      DP("Node address " DPxMOD ", target pointer " DPxMOD ", size %zu\n",
-         DPxPTR(NodePtr), DPxPTR(TgtPtr), Size);
-    }
-
-    assert(NodePtr && "NodePtr should not be nullptr at this point");
-
-    return NodePtr->Ptr;
-  }
-
-  /// Deallocate memory pointed by \p TgtPtr
-  int free(void *TgtPtr) {
-    DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr));
-
-    NodeTy *P = nullptr;
-
-    // Look it up into the table
-    {
-      std::lock_guard<std::mutex> G(MapTableLock);
-      auto Itr = PtrToNodeTable.find(TgtPtr);
-
-      // We don't remove the node from the map table because the map does not
-      // change.
-      if (Itr != PtrToNodeTable.end())
-        P = &Itr->second;
-    }
-
-    // The memory is not managed by the manager
-    if (P == nullptr) {
-      DP("Cannot find its node. Delete it on device directly.\n");
-      return deleteOnDevice(TgtPtr);
-    }
-
-    // Insert the node to the free list
-    const int B = findBucket(P->Size);
-
-    DP("Found its node " DPxMOD ". Insert it to bucket %d.\n", DPxPTR(P), B);
-
-    {
-      std::lock_guard<std::mutex> G(FreeListLocks[B]);
-      FreeLists[B].insert(*P);
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Get the size threshold from the environment variable
-  /// \p LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD . Returns a <tt>
-  /// std::pair<size_t, bool> </tt> where the first element represents the
-  /// threshold and the second element represents whether user disables memory
-  /// manager explicitly by setting the var to 0. If user doesn't specify
-  /// anything, returns <0, true>.
-  static std::pair<size_t, bool> getSizeThresholdFromEnv() {
-    size_t Threshold = 0;
-
-    if (const char *Env =
-            std::getenv("LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD")) {
-      Threshold = std::stoul(Env);
-      if (Threshold == 0) {
-        DP("Disabled memory manager as user set "
-           "LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=0.\n");
-        return std::make_pair(0, false);
-      }
-    }
-
-    return std::make_pair(Threshold, true);
-  }
-};
-
-// GCC still cannot handle the static data member like Clang so we still need
-// this part.
-constexpr const size_t MemoryManagerTy::BucketSize[];
-constexpr const int MemoryManagerTy::NumBuckets;
-
-#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_MEMORYMANAGER_H
diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Common ELF functionality for target plugins
-#
-##===----------------------------------------------------------------------===##
-
-# NOTE: Don't try to build `elf_common` using `add_llvm_library`.
-# See openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
-# for more explanation.
-add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp)
-
-# This is required when using LLVM libraries.
-llvm_update_compile_flags(elf_common)
-
-if (LLVM_LINK_LLVM_DYLIB)
-  set(llvm_libs LLVM)
-else()
-  llvm_map_components_to_libnames(llvm_libs BinaryFormat Object Support)
-endif()
-
-target_link_libraries(elf_common PUBLIC ${llvm_libs} ${OPENMP_PTHREAD_LIB})
-
-# Build elf_common with PIC to be able to link it with plugin shared libraries.
-set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON)
-
-# Expose elf_common.h directory to the users of this library.
-target_include_directories(elf_common
-  INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}
-  PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}
-)
diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h b/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- ELFSymbols.h - ELF Symbol look-up functionality ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// ELF routines for obtaining a symbol from an Elf file without loading it.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_SYMBOLS_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_SYMBOLS_H
-
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-/// Returns the symbol associated with the \p Name in the \p ELFObj. It will
-/// first search for the hash sections to identify symbols from the hash table.
-/// If that fails it will fall back to a linear search in the case of an
-/// executable file without a hash table.
-llvm::Expected<const typename llvm::object::ELF64LE::Sym *>
-getELFSymbol(const llvm::object::ELFObjectFile<llvm::object::ELF64LE> &ELFObj,
-             llvm::StringRef Name);
-
-#endif
diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp b/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-//===-- ELFSymbols.cpp - ELF Symbol look-up functionality -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ELFSymbols.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-
-template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymbolFromGnuHashTable(StringRef Name, const typename ELFT::GnuHash &HashTab,
-                          ArrayRef<typename ELFT::Sym> SymTab,
-                          StringRef StrTab) {
-  const uint32_t NameHash = hashGnu(Name);
-  const typename ELFT::Word NBucket = HashTab.nbuckets;
-  const typename ELFT::Word SymOffset = HashTab.symndx;
-  ArrayRef<typename ELFT::Off> Filter = HashTab.filter();
-  ArrayRef<typename ELFT::Word> Bucket = HashTab.buckets();
-  ArrayRef<typename ELFT::Word> Chain = HashTab.values(SymTab.size());
-
-  // Check the bloom filter and exit early if the symbol is not present.
-  uint64_t ElfClassBits = ELFT::Is64Bits ? 64 : 32;
-  typename ELFT::Off Word =
-      Filter[(NameHash / ElfClassBits) % HashTab.maskwords];
-  uint64_t Mask = (0x1ull << (NameHash % ElfClassBits)) |
-                  (0x1ull << ((NameHash >> HashTab.shift2) % ElfClassBits));
-  if ((Word & Mask) != Mask)
-    return nullptr;
-
-  // The symbol may or may not be present, check the hash values.
-  for (typename ELFT::Word I = Bucket[NameHash % NBucket];
-       I >= SymOffset && I < SymTab.size(); I = I + 1) {
-    const uint32_t ChainHash = Chain[I - SymOffset];
-
-    if ((NameHash | 0x1) != (ChainHash | 0x1))
-      continue;
-
-    if (SymTab[I].st_name >= StrTab.size())
-      return createError("symbol [index " + Twine(I) +
-                         "] has invalid st_name: " + Twine(SymTab[I].st_name));
-    if (StrTab.drop_front(SymTab[I].st_name).data() == Name)
-      return &SymTab[I];
-
-    if (ChainHash & 0x1)
-      return nullptr;
-  }
-  return nullptr;
-}
-
-template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymbolFromSysVHashTable(StringRef Name, const typename ELFT::Hash &HashTab,
-                           ArrayRef<typename ELFT::Sym> SymTab,
-                           StringRef StrTab) {
-  const uint32_t Hash = hashSysV(Name);
-  const typename ELFT::Word NBucket = HashTab.nbucket;
-  ArrayRef<typename ELFT::Word> Bucket = HashTab.buckets();
-  ArrayRef<typename ELFT::Word> Chain = HashTab.chains();
-  for (typename ELFT::Word I = Bucket[Hash % NBucket]; I != ELF::STN_UNDEF;
-       I = Chain[I]) {
-    if (I >= SymTab.size())
-      return createError(
-          "symbol [index " + Twine(I) +
-          "] is greater than the number of symbols: " + Twine(SymTab.size()));
-    if (SymTab[I].st_name >= StrTab.size())
-      return createError("symbol [index " + Twine(I) +
-                         "] has invalid st_name: " + Twine(SymTab[I].st_name));
-
-    if (StrTab.drop_front(SymTab[I].st_name).data() == Name)
-      return &SymTab[I];
-  }
-  return nullptr;
-}
-
-template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getHashTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
-                   StringRef Name) {
-  if (Sec.sh_type != ELF::SHT_HASH && Sec.sh_type != ELF::SHT_GNU_HASH)
-    return createError(
-        "invalid sh_type for hash table, expected SHT_HASH or SHT_GNU_HASH");
-  Expected<typename ELFT::ShdrRange> SectionsOrError = Elf.sections();
-  if (!SectionsOrError)
-    return SectionsOrError.takeError();
-
-  auto SymTabOrErr = getSection<ELFT>(*SectionsOrError, Sec.sh_link);
-  if (!SymTabOrErr)
-    return SymTabOrErr.takeError();
-
-  auto StrTabOrErr =
-      Elf.getStringTableForSymtab(**SymTabOrErr, *SectionsOrError);
-  if (!StrTabOrErr)
-    return StrTabOrErr.takeError();
-  StringRef StrTab = *StrTabOrErr;
-
-  auto SymsOrErr = Elf.symbols(*SymTabOrErr);
-  if (!SymsOrErr)
-    return SymsOrErr.takeError();
-  ArrayRef<typename ELFT::Sym> SymTab = *SymsOrErr;
-
-  // If this is a GNU hash table we verify its size and search the symbol
-  // table using the GNU hash table format.
-  if (Sec.sh_type == ELF::SHT_GNU_HASH) {
-    const typename ELFT::GnuHash *HashTab =
-        reinterpret_cast<const typename ELFT::GnuHash *>(Elf.base() +
-                                                         Sec.sh_offset);
-    if (Sec.sh_offset + Sec.sh_size >= Elf.getBufSize())
-      return createError("section has invalid sh_offset: " +
-                         Twine(Sec.sh_offset));
-    if (Sec.sh_size < sizeof(typename ELFT::GnuHash) ||
-        Sec.sh_size <
-            sizeof(typename ELFT::GnuHash) +
-                sizeof(typename ELFT::Word) * HashTab->maskwords +
-                sizeof(typename ELFT::Word) * HashTab->nbuckets +
-                sizeof(typename ELFT::Word) * (SymTab.size() - HashTab->symndx))
-      return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
-    return getSymbolFromGnuHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
-  }
-
-  // If this is a Sys-V hash table we verify its size and search the symbol
-  // table using the Sys-V hash table format.
-  if (Sec.sh_type == ELF::SHT_HASH) {
-    const typename ELFT::Hash *HashTab =
-        reinterpret_cast<const typename ELFT::Hash *>(Elf.base() +
-                                                      Sec.sh_offset);
-    if (Sec.sh_offset + Sec.sh_size >= Elf.getBufSize())
-      return createError("section has invalid sh_offset: " +
-                         Twine(Sec.sh_offset));
-    if (Sec.sh_size < sizeof(typename ELFT::Hash) ||
-        Sec.sh_size < sizeof(typename ELFT::Hash) +
-                          sizeof(typename ELFT::Word) * HashTab->nbucket +
-                          sizeof(typename ELFT::Word) * HashTab->nchain)
-      return createError("section has invalid sh_size: " + Twine(Sec.sh_size));
-
-    return getSymbolFromSysVHashTable<ELFT>(Name, *HashTab, SymTab, StrTab);
-  }
-
-  return nullptr;
-}
-
-template <class ELFT>
-static Expected<const typename ELFT::Sym *>
-getSymTableSymbol(const ELFFile<ELFT> &Elf, const typename ELFT::Shdr &Sec,
-                  StringRef Name) {
-  if (Sec.sh_type != ELF::SHT_SYMTAB && Sec.sh_type != ELF::SHT_DYNSYM)
-    return createError(
-        "invalid sh_type for hash table, expected SHT_SYMTAB or SHT_DYNSYM");
-  Expected<typename ELFT::ShdrRange> SectionsOrError = Elf.sections();
-  if (!SectionsOrError)
-    return SectionsOrError.takeError();
-
-  auto StrTabOrErr = Elf.getStringTableForSymtab(Sec, *SectionsOrError);
-  if (!StrTabOrErr)
-    return StrTabOrErr.takeError();
-  StringRef StrTab = *StrTabOrErr;
-
-  auto SymsOrErr = Elf.symbols(&Sec);
-  if (!SymsOrErr)
-    return SymsOrErr.takeError();
-  ArrayRef<typename ELFT::Sym> SymTab = *SymsOrErr;
-
-  for (const typename ELFT::Sym &Sym : SymTab)
-    if (StrTab.drop_front(Sym.st_name).data() == Name)
-      return &Sym;
-
-  return nullptr;
-}
-
-Expected<const typename ELF64LE::Sym *>
-getELFSymbol(const ELFObjectFile<ELF64LE> &ELFObj, StringRef Name) {
-  // First try to look up the symbol via the hash table.
-  for (ELFSectionRef Sec : ELFObj.sections()) {
-    if (Sec.getType() != SHT_HASH && Sec.getType() != SHT_GNU_HASH)
-      continue;
-
-    auto HashTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
-    if (!HashTabOrErr)
-      return HashTabOrErr.takeError();
-    return getHashTableSymbol<ELF64LE>(ELFObj.getELFFile(), **HashTabOrErr,
-                                       Name);
-  }
-
-  // If this is an executable file check the entire standard symbol table.
-  for (ELFSectionRef Sec : ELFObj.sections()) {
-    if (Sec.getType() != SHT_SYMTAB)
-      continue;
-
-    auto SymTabOrErr = ELFObj.getELFFile().getSection(Sec.getIndex());
-    if (!SymTabOrErr)
-      return SymTabOrErr.takeError();
-    return getSymTableSymbol<ELF64LE>(ELFObj.getELFFile(), **SymTabOrErr, Name);
-  }
-
-  return nullptr;
-}
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.h b/openmp/libomptarget/plugins/common/elf_common/elf_common.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/elf_common/elf_common.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- elf_common.h - Common ELF functionality -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Common ELF functionality for target plugins.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_COMMON_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_COMMON_H
-
-#include "omptargetplugin.h"
-#include <cstdint>
-
-/// Return non-zero, if the given \p image is an ELF object, which
-/// e_machine matches \p target_id; return zero otherwise.
-EXTERN int32_t elf_check_machine(__tgt_device_image *Image, uint16_t TargetId);
-
-/// Return non-zero, if the given \p image is an ET_DYN ELF object;
-/// return zero otherwise.
-EXTERN int32_t elf_is_dynamic(__tgt_device_image *Image);
-
-#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_COMMON_H
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp b/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp
+++ /dev/null
@@ -1,88 +0,0 @@
-//===-- elf_common.cpp - Common ELF functionality -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Common ELF functionality for target plugins.
-//
-//===----------------------------------------------------------------------===//
-#include "elf_common.h"
-#include "Debug.h"
-
-#include "llvm/BinaryFormat/Magic.h"
-#include "llvm/Object/Binary.h"
-#include "llvm/Object/ELFObjectFile.h"
-#include "llvm/Object/ELFTypes.h"
-#include "llvm/Object/ObjectFile.h"
-#include "llvm/Support/MemoryBuffer.h"
-
-#ifndef TARGET_NAME
-#define TARGET_NAME ELF Common
-#endif
-#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME)
-
-using namespace llvm;
-using namespace llvm::ELF;
-using namespace llvm::object;
-
-/// If the given range of bytes [\p BytesBegin, \p BytesEnd) represents
-/// a valid ELF, then invoke \p Callback on the ELFObjectFileBase
-/// created from this range, otherwise, return 0.
-/// If \p Callback is invoked, then return whatever value \p Callback returns.
-template <typename F>
-static int32_t withBytesAsElf(char *BytesBegin, char *BytesEnd, F Callback) {
-  size_t Size = BytesEnd - BytesBegin;
-  StringRef StrBuf(BytesBegin, Size);
-
-  auto Magic = identify_magic(StrBuf);
-  if (Magic != file_magic::elf && Magic != file_magic::elf_relocatable &&
-      Magic != file_magic::elf_executable &&
-      Magic != file_magic::elf_shared_object && Magic != file_magic::elf_core) {
-    DP("Not an ELF image!\n");
-    return 0;
-  }
-
-  std::unique_ptr<MemoryBuffer> MemBuf =
-      MemoryBuffer::getMemBuffer(StrBuf, "", false);
-  Expected<std::unique_ptr<ObjectFile>> BinOrErr =
-      ObjectFile::createELFObjectFile(MemBuf->getMemBufferRef(),
-                                      /*InitContent=*/false);
-  if (!BinOrErr) {
-    DP("Unable to get ELF handle: %s!\n",
-       toString(BinOrErr.takeError()).c_str());
-    return 0;
-  }
-
-  auto *Object = dyn_cast<const ELFObjectFileBase>(BinOrErr->get());
-
-  if (!Object) {
-    DP("Unknown ELF format!\n");
-    return 0;
-  }
-
-  return Callback(Object);
-}
-
-// Check whether an image is valid for execution on target_id
-int32_t elf_check_machine(__tgt_device_image *Image, uint16_t TargetId) {
-  auto CheckMachine = [TargetId](const ELFObjectFileBase *Object) {
-    return TargetId == Object->getEMachine();
-  };
-  return withBytesAsElf(reinterpret_cast<char *>(Image->ImageStart),
-                        reinterpret_cast<char *>(Image->ImageEnd),
-                        CheckMachine);
-}
-
-int32_t elf_is_dynamic(__tgt_device_image *Image) {
-  auto CheckDynType = [](const ELFObjectFileBase *Object) {
-    uint16_t Type = Object->getEType();
-    DP("ELF Type: %d\n", Type);
-    return Type == ET_DYN;
-  };
-  return withBytesAsElf(reinterpret_cast<char *>(Image->ImageStart),
-                        reinterpret_cast<char *>(Image->ImageEnd),
-                        CheckDynType);
-}
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a CUDA machine if available.
-#
-##===----------------------------------------------------------------------===##
-set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL
-  "Whether to build CUDA plugin")
-if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN)
-  libomptarget_say("Not building CUDA offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false")
-  return()
-endif()
-
-if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
-  return()
-endif()
-
-libomptarget_say("Building CUDA offloading plugin.")
-
-set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})
-
-add_llvm_library(omptarget.rtl.cuda SHARED
-  src/rtl.cpp
-
-  LINK_COMPONENTS
-  Support
-  Object
-
-  LINK_LIBS PRIVATE
-  elf_common
-  MemoryManager
-  ${OPENMP_PTHREAD_LIB}
-  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs"
-
-  NO_INSTALL_RPATH
-)
-
-if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
-  libomptarget_say("Building CUDA plugin linked against libcuda")
-  target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
-else()
-  libomptarget_say("Building CUDA plugin for dlopened libcuda")
-  target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda)
-  target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp)
-endif()
-add_dependencies(omptarget.rtl.cuda omptarget.devicertl.nvptx)
-
-# Define the suffix for the runtime messaging dumps.
-target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA")
-target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.cuda PROPERTIES 
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-  CXX_VISIBILITY_PRESET protected)
-
-# Report to the parent scope that we are building a plugin for CUDA.
-# This controls whether tests are run for the nvptx offloading target
-# Run them if libcuda is available, or if the user explicitly asked for dlopen
-# Otherwise this plugin is being built speculatively and there may be no cuda available
-option(LIBOMPTARGET_FORCE_NVIDIA_TESTS "Build NVIDIA libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_NVIDIA_GPU OR LIBOMPTARGET_FORCE_NVIDIA_TESTS)
-  libomptarget_say("Enable tests using CUDA plugin")
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-LTO" PARENT_SCOPE)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda")
-  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
-endif()
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ /dev/null
@@ -1,271 +0,0 @@
-//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The parts of the cuda api that are presently in use by the openmp cuda plugin
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
-#define DYNAMIC_CUDA_CUDA_H_INCLUDED
-
-#include <cstddef>
-#include <cstdint>
-
-typedef int CUdevice;
-typedef uintptr_t CUdeviceptr;
-typedef struct CUmod_st *CUmodule;
-typedef struct CUctx_st *CUcontext;
-typedef struct CUfunc_st *CUfunction;
-typedef struct CUstream_st *CUstream;
-typedef struct CUevent_st *CUevent;
-
-#define CU_DEVICE_INVALID ((CUdevice)-2)
-
-typedef enum cudaError_enum {
-  CUDA_SUCCESS = 0,
-  CUDA_ERROR_INVALID_VALUE = 1,
-  CUDA_ERROR_NO_DEVICE = 100,
-  CUDA_ERROR_INVALID_HANDLE = 400,
-  CUDA_ERROR_NOT_READY = 600,
-  CUDA_ERROR_TOO_MANY_PEERS = 711,
-} CUresult;
-
-typedef enum CUstream_flags_enum {
-  CU_STREAM_DEFAULT = 0x0,
-  CU_STREAM_NON_BLOCKING = 0x1,
-} CUstream_flags;
-
-typedef enum CUlimit_enum {
-  CU_LIMIT_STACK_SIZE = 0x0,
-  CU_LIMIT_PRINTF_FIFO_SIZE = 0x1,
-  CU_LIMIT_MALLOC_HEAP_SIZE = 0x2,
-  CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x3,
-  CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x4,
-  CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x5,
-  CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x6,
-  CU_LIMIT_MAX
-} CUlimit;
-
-typedef enum CUdevice_attribute_enum {
-  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
-  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
-  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
-  CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
-  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
-  CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
-  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
-  CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
-  CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
-  CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
-  CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
-  CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
-  CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
-  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
-  CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
-  CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
-  CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
-  CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
-  CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
-  CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
-  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
-  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
-  CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
-  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
-  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
-  CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
-  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
-  CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
-  CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
-  CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
-  CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
-  CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
-  CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
-  CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
-  CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
-  CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
-  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
-  CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
-  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,
-  CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
-  CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
-  CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
-  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
-  CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
-  CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,
-  CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,
-  CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,
-  CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,
-  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,
-  CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,
-  CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,
-  CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100,
-  CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,
-  CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,
-  CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,
-  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,
-  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,
-  CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,
-  CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,
-  CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,
-  CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,
-  CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,
-  CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,
-  CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,
-  CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,
-  CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,
-  CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,
-  CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,
-  CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,
-  CU_DEVICE_ATTRIBUTE_MAX,
-} CUdevice_attribute;
-
-typedef enum CUfunction_attribute_enum {
-  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
-} CUfunction_attribute;
-
-typedef enum CUctx_flags_enum {
-  CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
-  CU_CTX_SCHED_MASK = 0x07,
-} CUctx_flags;
-
-typedef enum CUmemAttach_flags_enum {
-  CU_MEM_ATTACH_GLOBAL = 0x1,
-  CU_MEM_ATTACH_HOST = 0x2,
-  CU_MEM_ATTACH_SINGLE = 0x4,
-} CUmemAttach_flags;
-
-typedef enum CUcomputeMode_enum {
-  CU_COMPUTEMODE_DEFAULT = 0,
-  CU_COMPUTEMODE_PROHIBITED = 2,
-  CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,
-} CUcompute_mode;
-
-typedef enum CUevent_flags_enum {
-  CU_EVENT_DEFAULT = 0x0,
-  CU_EVENT_BLOCKING_SYNC = 0x1,
-  CU_EVENT_DISABLE_TIMING = 0x2,
-  CU_EVENT_INTERPROCESS = 0x4
-} CUevent_flags;
-
-CUresult cuCtxGetDevice(CUdevice *);
-CUresult cuDeviceGet(CUdevice *, int);
-CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
-CUresult cuDeviceGetCount(int *);
-CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
-
-// Device info
-CUresult cuDeviceGetName(char *, int, CUdevice);
-CUresult cuDeviceTotalMem(size_t *, CUdevice);
-CUresult cuDriverGetVersion(int *);
-
-CUresult cuGetErrorString(CUresult, const char **);
-CUresult cuInit(unsigned);
-CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
-                        unsigned, unsigned, unsigned, CUstream, void **,
-                        void **);
-
-CUresult cuMemAlloc(CUdeviceptr *, size_t);
-CUresult cuMemAllocHost(void **, size_t);
-CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
-
-CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
-CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
-CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
-CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
-CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
-
-CUresult cuMemFree(CUdeviceptr);
-CUresult cuMemFreeHost(void *);
-
-CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
-CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
-
-CUresult cuModuleUnload(CUmodule);
-CUresult cuStreamCreate(CUstream *, unsigned);
-CUresult cuStreamDestroy(CUstream);
-CUresult cuStreamSynchronize(CUstream);
-CUresult cuStreamQuery(CUstream);
-CUresult cuCtxSetCurrent(CUcontext);
-CUresult cuDevicePrimaryCtxRelease(CUdevice);
-CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
-CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
-CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
-CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
-                            void **);
-
-CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
-CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
-CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
-                           size_t, CUstream);
-
-CUresult cuCtxGetLimit(size_t *, CUlimit);
-CUresult cuCtxSetLimit(CUlimit, size_t);
-
-CUresult cuEventCreate(CUevent *, unsigned int);
-CUresult cuEventRecord(CUevent, CUstream);
-CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
-CUresult cuEventSynchronize(CUevent);
-CUresult cuEventDestroy(CUevent);
-
-#endif
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===--- cuda/dynamic_cuda/cuda.pp ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implement subset of cuda api by calling into cuda library via dlopen
-// Does the dlopen/dlsym calls as part of the call to cuInit
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/DynamicLibrary.h"
-
-#include "Debug.h"
-#include "cuda.h"
-#include "dlwrap.h"
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-
-DLWRAP_INITIALIZE()
-
-DLWRAP_INTERNAL(cuInit, 1)
-
-DLWRAP(cuCtxGetDevice, 1)
-DLWRAP(cuDeviceGet, 2)
-DLWRAP(cuDeviceGetAttribute, 3)
-DLWRAP(cuDeviceGetCount, 1)
-DLWRAP(cuFuncGetAttribute, 3)
-
-// Device info
-DLWRAP(cuDeviceGetName, 3)
-DLWRAP(cuDeviceTotalMem, 2)
-DLWRAP(cuDriverGetVersion, 1)
-
-DLWRAP(cuGetErrorString, 2)
-DLWRAP(cuLaunchKernel, 11)
-
-DLWRAP(cuMemAlloc, 2)
-DLWRAP(cuMemAllocHost, 2)
-DLWRAP(cuMemAllocManaged, 3)
-
-DLWRAP(cuMemcpyDtoDAsync, 4)
-DLWRAP(cuMemcpyDtoH, 3)
-DLWRAP(cuMemcpyDtoHAsync, 4)
-DLWRAP(cuMemcpyHtoD, 3)
-DLWRAP(cuMemcpyHtoDAsync, 4)
-
-DLWRAP(cuMemFree, 1)
-DLWRAP(cuMemFreeHost, 1)
-DLWRAP(cuModuleGetFunction, 3)
-DLWRAP(cuModuleGetGlobal, 4)
-
-DLWRAP(cuModuleUnload, 1)
-DLWRAP(cuStreamCreate, 2)
-DLWRAP(cuStreamDestroy, 1)
-DLWRAP(cuStreamSynchronize, 1)
-DLWRAP(cuStreamQuery, 1)
-DLWRAP(cuCtxSetCurrent, 1)
-DLWRAP(cuDevicePrimaryCtxRelease, 1)
-DLWRAP(cuDevicePrimaryCtxGetState, 3)
-DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
-DLWRAP(cuDevicePrimaryCtxRetain, 2)
-DLWRAP(cuModuleLoadDataEx, 5)
-
-DLWRAP(cuDeviceCanAccessPeer, 3)
-DLWRAP(cuCtxEnablePeerAccess, 2)
-DLWRAP(cuMemcpyPeerAsync, 6)
-
-DLWRAP(cuCtxGetLimit, 2)
-DLWRAP(cuCtxSetLimit, 2)
-
-DLWRAP(cuEventCreate, 2)
-DLWRAP(cuEventRecord, 2)
-DLWRAP(cuStreamWaitEvent, 3)
-DLWRAP(cuEventSynchronize, 1)
-DLWRAP(cuEventDestroy, 1)
-
-DLWRAP_FINALIZE()
-
-#ifndef DYNAMIC_CUDA_PATH
-#define DYNAMIC_CUDA_PATH "libcuda.so"
-#endif
-
-#define TARGET_NAME CUDA
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-
-static bool checkForCUDA() {
-  // return true if dlopen succeeded and all functions found
-
-  // Prefer _v2 versions of functions if found in the library
-  std::unordered_map<std::string, const char *> TryFirst = {
-      {"cuMemAlloc", "cuMemAlloc_v2"},
-      {"cuMemFree", "cuMemFree_v2"},
-      {"cuMemcpyDtoH", "cuMemcpyDtoH_v2"},
-      {"cuMemcpyHtoD", "cuMemcpyHtoD_v2"},
-      {"cuStreamDestroy", "cuStreamDestroy_v2"},
-      {"cuModuleGetGlobal", "cuModuleGetGlobal_v2"},
-      {"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"},
-      {"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"},
-      {"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"},
-      {"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"},
-      {"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"},
-  };
-
-  const char *CudaLib = DYNAMIC_CUDA_PATH;
-  std::string ErrMsg;
-  auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
-      llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg));
-  if (!DynlibHandle->isValid()) {
-    DP("Unable to load library '%s': %s!\n", CudaLib, ErrMsg.c_str());
-    return false;
-  }
-
-  for (size_t I = 0; I < dlwrap::size(); I++) {
-    const char *Sym = dlwrap::symbol(I);
-
-    auto It = TryFirst.find(Sym);
-    if (It != TryFirst.end()) {
-      const char *First = It->second;
-      void *P = DynlibHandle->getAddressOfSymbol(First);
-      if (P) {
-        DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P);
-        *dlwrap::pointer(I) = P;
-        continue;
-      }
-    }
-
-    void *P = DynlibHandle->getAddressOfSymbol(Sym);
-    if (P == nullptr) {
-      DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
-      return false;
-    }
-    DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
-
-    *dlwrap::pointer(I) = P;
-  }
-
-  return true;
-}
-
-CUresult cuInit(unsigned X) {
-  // Note: Called exactly once from cuda rtl.cpp in a global constructor so
-  // does not need to handle being called repeatedly or concurrently
-  if (!checkForCUDA()) {
-    return CUDA_ERROR_INVALID_HANDLE;
-  }
-  return dlwrap_cuInit(X);
-}
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ /dev/null
@@ -1,1906 +0,0 @@
-//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for CUDA machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cuda.h>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <vector>
-
-#include "Debug.h"
-#include "DeviceEnvironment.h"
-#include "omptarget.h"
-#include "omptargetplugin.h"
-
-#define TARGET_NAME CUDA
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-
-#include "MemoryManager.h"
-
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-
-using namespace llvm;
-
-// Utility for retrieving and printing CUDA error string.
-#ifdef OMPTARGET_DEBUG
-#define CUDA_ERR_STRING(err)                                                   \
-  do {                                                                         \
-    if (getDebugLevel() > 0) {                                                 \
-      const char *errStr = nullptr;                                            \
-      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
-      if (errStr_status == CUDA_ERROR_INVALID_VALUE)                           \
-        REPORT("Unrecognized CUDA error code: %d\n", err);                     \
-      else if (errStr_status == CUDA_SUCCESS)                                  \
-        REPORT("CUDA error is: %s\n", errStr);                                 \
-      else {                                                                   \
-        REPORT("Unresolved CUDA error code: %d\n", err);                       \
-        REPORT("Unsuccessful cuGetErrorString return status: %d\n",            \
-               errStr_status);                                                 \
-      }                                                                        \
-    } else {                                                                   \
-      const char *errStr = nullptr;                                            \
-      CUresult errStr_status = cuGetErrorString(err, &errStr);                 \
-      if (errStr_status == CUDA_SUCCESS)                                       \
-        REPORT("%s \n", errStr);                                               \
-    }                                                                          \
-  } while (false)
-#else // OMPTARGET_DEBUG
-#define CUDA_ERR_STRING(err)                                                   \
-  do {                                                                         \
-    const char *errStr = nullptr;                                              \
-    CUresult errStr_status = cuGetErrorString(err, &errStr);                   \
-    if (errStr_status == CUDA_SUCCESS)                                         \
-      REPORT("%s \n", errStr);                                                 \
-  } while (false)
-#endif // OMPTARGET_DEBUG
-
-#define BOOL2TEXT(b) ((b) ? "Yes" : "No")
-
-#include "elf_common.h"
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  std::vector<__tgt_offload_entry> Entries;
-};
-
-/// Use a single entity to encode a kernel and a set of flags.
-struct KernelTy {
-  CUfunction Func;
-
-  // execution mode of kernel
-  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
-
-  /// Maximal number of threads per block for this kernel.
-  int MaxThreadsPerBlock = 0;
-
-  KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
-      : Func(Func), ExecutionMode(ExecutionMode) {}
-};
-
-namespace {
-bool checkResult(CUresult Err, const char *ErrMsg) {
-  if (Err == CUDA_SUCCESS)
-    return true;
-
-  REPORT("%s", ErrMsg);
-  CUDA_ERR_STRING(Err);
-  return false;
-}
-
-int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
-               CUstream Stream) {
-  CUresult Err =
-      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
-
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when copying data from device to device. Pointers: src "
-       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
-  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-  CUresult Err = cuEventRecord(Event, Stream);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
-       DPxPTR(Stream), DPxPTR(Event));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int syncEvent(void *EventPtr) {
-  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-  CUresult Err = cuEventSynchronize(Event);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-namespace {
-
-// Structure contains per-device data
-struct DeviceDataTy {
-  /// List that contains all the kernels.
-  std::list<KernelTy> KernelsList;
-
-  std::list<FuncOrGblEntryTy> FuncGblEntries;
-
-  CUcontext Context = nullptr;
-  // Device properties
-  unsigned int ThreadsPerBlock = 0;
-  unsigned int BlocksPerGrid = 0;
-  unsigned int WarpSize = 0;
-  // OpenMP properties
-  unsigned int NumTeams = 0;
-  unsigned int NumThreads = 0;
-};
-
-/// Resource allocator where \p T is the resource type.
-/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
-/// accordingly. The implementation should not raise any exception.
-template <typename T> struct AllocatorTy {
-  using ElementTy = T;
-  virtual ~AllocatorTy() {}
-
-  /// Create a resource and assign to R.
-  virtual int create(T &R) noexcept = 0;
-  /// Destroy the resource.
-  virtual int destroy(T) noexcept = 0;
-};
-
-/// Allocator for CUstream.
-struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
-  /// See AllocatorTy<T>::create.
-  int create(CUstream &Stream) noexcept override {
-    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
-                     "Error returned from cuStreamCreate\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// See AllocatorTy<T>::destroy.
-  int destroy(CUstream Stream) noexcept override {
-    if (!checkResult(cuStreamDestroy(Stream),
-                     "Error returned from cuStreamDestroy\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-/// Allocator for CUevent.
-struct EventAllocatorTy final : public AllocatorTy<CUevent> {
-  /// See AllocatorTy<T>::create.
-  int create(CUevent &Event) noexcept override {
-    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
-                     "Error returned from cuEventCreate\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// See AllocatorTy<T>::destroy.
-  int destroy(CUevent Event) noexcept override {
-    if (!checkResult(cuEventDestroy(Event),
-                     "Error returned from cuEventDestroy\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-/// A generic pool of resources where \p T is the resource type.
-/// \p T should be copyable as the object is stored in \p std::vector .
-template <typename AllocTy> class ResourcePoolTy {
-  using ElementTy = typename AllocTy::ElementTy;
-  /// Index of the next available resource.
-  size_t Next = 0;
-  /// Mutex to guard the pool.
-  std::mutex Mutex;
-  /// Pool of resources. The difference between \p Resources and \p Pool is,
-  /// when a resource is acquired and released, it is all on \p Resources. When
-  /// a batch of new resources are needed, they are both added to \p Resources
-  /// and \p Pool. The reason for this setting is, \p Resources could contain
-  /// redundant elements because resources are not released, which can cause
-  /// double free. This setting makes sure that \p Pool always has every
-  /// resource allocated from the device.
-  std::vector<ElementTy> Resources;
-  std::vector<ElementTy> Pool;
-  /// A reference to the corresponding allocator.
-  AllocTy Allocator;
-
-  /// If `Resources` is used up, we will fill in more resources. It assumes that
-  /// the new size `Size` should be always larger than the current size.
-  bool resize(size_t Size) {
-    assert(Resources.size() == Pool.size() && "size mismatch");
-    auto CurSize = Resources.size();
-    assert(Size > CurSize && "Unexpected smaller size");
-    Pool.reserve(Size);
-    Resources.reserve(Size);
-    for (auto I = CurSize; I < Size; ++I) {
-      ElementTy NewItem;
-      int Ret = Allocator.create(NewItem);
-      if (Ret != OFFLOAD_SUCCESS)
-        return false;
-      Pool.push_back(NewItem);
-      Resources.push_back(NewItem);
-    }
-    return true;
-  }
-
-public:
-  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
-      : Allocator(std::move(A)) {
-    if (Size)
-      (void)resize(Size);
-  }
-
-  ~ResourcePoolTy() noexcept { clear(); }
-
-  /// Get a resource from pool. `Next` always points to the next available
-  /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
-  /// still available. If there is no resource left, we will ask for more. Each
-  /// time a resource is assigned, the id will increase one.
-  /// xxxxxs+++++++++
-  ///      ^
-  ///      Next
-  /// After assignment, the pool becomes the following and s is assigned.
-  /// xxxxxs+++++++++
-  ///       ^
-  ///       Next
-  int acquire(ElementTy &R) noexcept {
-    std::lock_guard<std::mutex> LG(Mutex);
-    if (Next == Resources.size()) {
-      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
-      if (!resize(NewSize))
-        return OFFLOAD_FAIL;
-    }
-
-    assert(Next < Resources.size());
-
-    R = Resources[Next++];
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Return the resource back to the pool. When we return a resource, we need
-  /// to first decrease `Next`, and then copy the resource back. It is worth
-  /// noting that, the order of resources return might be different from that
-  /// they're assigned, that saying, at some point, there might be two identical
-  /// resources.
-  /// xxax+a+++++
-  ///     ^
-  ///     Next
-  /// However, it doesn't matter, because they're always on the two sides of
-  /// `Next`. The left one will in the end be overwritten by another resource.
-  /// Therefore, after several execution, the order of pool might be different
-  /// from its initial state.
-  void release(ElementTy R) noexcept {
-    std::lock_guard<std::mutex> LG(Mutex);
-    Resources[--Next] = R;
-  }
-
-  /// Released all stored resources and clear the pool.
-  /// Note: This function is not thread safe. Be sure to guard it if necessary.
-  void clear() noexcept {
-    for (auto &R : Pool)
-      (void)Allocator.destroy(R);
-    Pool.clear();
-    Resources.clear();
-  }
-};
-
-} // namespace
-
-class DeviceRTLTy {
-  int NumberOfDevices;
-  // OpenMP environment properties
-  int EnvNumTeams;
-  unsigned int EnvTeamLimit;
-  unsigned int EnvTeamThreadLimit;
-  // OpenMP requires flags
-  int64_t RequiresFlags;
-  // Amount of dynamic shared memory to use at launch.
-  uint64_t DynamicMemorySize;
-
-  /// Number of initial streams for each device.
-  int NumInitialStreams = 32;
-
-  /// Number of initial events for each device.
-  int NumInitialEvents = 8;
-
-  static constexpr const int32_t HardThreadLimit = 1024;
-  static constexpr const int32_t DefaultNumTeams = 128;
-  static constexpr const int32_t DefaultNumThreads = 128;
-
-  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
-  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
-
-  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
-  std::vector<std::unique_ptr<EventPoolTy>> EventPool;
-
-  std::vector<DeviceDataTy> DeviceData;
-  std::vector<std::vector<CUmodule>> Modules;
-
-  /// Vector of flags indicating the initalization status of all associated
-  /// devices.
-  std::vector<bool> InitializedFlags;
-
-  enum class PeerAccessState : uint8_t { Unkown, Yes, No };
-  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
-  std::mutex PeerAccessMatrixLock;
-
-  /// A class responsible for interacting with device native runtime library to
-  /// allocate and free memory.
-  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
-  public:
-    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
-      if (Size == 0)
-        return nullptr;
-
-      void *MemAlloc = nullptr;
-      CUresult Err;
-      switch (Kind) {
-      case TARGET_ALLOC_DEFAULT:
-      case TARGET_ALLOC_DEVICE:
-        CUdeviceptr DevicePtr;
-        Err = cuMemAlloc(&DevicePtr, Size);
-        MemAlloc = (void *)DevicePtr;
-        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
-          return nullptr;
-        break;
-      case TARGET_ALLOC_HOST:
-        void *HostPtr;
-        Err = cuMemAllocHost(&HostPtr, Size);
-        MemAlloc = HostPtr;
-        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
-          return nullptr;
-        break;
-      case TARGET_ALLOC_SHARED:
-        CUdeviceptr SharedPtr;
-        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
-        MemAlloc = (void *)SharedPtr;
-        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
-          return nullptr;
-        break;
-      }
-
-      return MemAlloc;
-    }
-
-    int free(void *TgtPtr, TargetAllocTy Kind) override {
-      CUresult Err;
-      // Host pinned memory must be freed differently.
-      switch (Kind) {
-      case TARGET_ALLOC_DEFAULT:
-      case TARGET_ALLOC_DEVICE:
-      case TARGET_ALLOC_SHARED:
-        Err = cuMemFree((CUdeviceptr)TgtPtr);
-        if (!checkResult(Err, "Error returned from cuMemFree\n"))
-          return OFFLOAD_FAIL;
-        break;
-      case TARGET_ALLOC_HOST:
-        Err = cuMemFreeHost(TgtPtr);
-        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
-          return OFFLOAD_FAIL;
-        break;
-      }
-
-      return OFFLOAD_SUCCESS;
-    }
-  };
-
-  /// A vector of device allocators
-  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
-
-  /// A vector of memory managers. Since the memory manager is non-copyable and
-  // non-removable, we wrap them into std::unique_ptr.
-  std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
-
-  /// Whether use memory manager
-  bool UseMemoryManager = true;
-
-  // Record entry point associated with device
-  void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
-    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-    E.Entries.push_back(Entry);
-  }
-
-  // Return a pointer to the entry associated with the pointer
-  const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
-                                             const void *Addr) const {
-    for (const __tgt_offload_entry &Itr :
-         DeviceData[DeviceId].FuncGblEntries.back().Entries)
-      if (Itr.addr == Addr)
-        return &Itr;
-
-    return nullptr;
-  }
-
-  // Return the pointer to the target entries table
-  __tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
-    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-
-    if (E.Entries.empty())
-      return nullptr;
-
-    // Update table info according to the entries and return the pointer
-    E.Table.EntriesBegin = E.Entries.data();
-    E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
-
-    return &E.Table;
-  }
-
-  // Clear entries table for a device
-  void clearOffloadEntriesTable(const int DeviceId) {
-    DeviceData[DeviceId].FuncGblEntries.emplace_back();
-    FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
-    E.Entries.clear();
-    E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
-  }
-
-public:
-  CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    if (!AsyncInfo->Queue) {
-      CUstream S;
-      if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
-        return nullptr;
-
-      AsyncInfo->Queue = S;
-    }
-
-    return reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  }
-
-  // This class should not be copied
-  DeviceRTLTy(const DeviceRTLTy &) = delete;
-  DeviceRTLTy(DeviceRTLTy &&) = delete;
-
-  DeviceRTLTy()
-      : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
-        DynamicMemorySize(0) {
-
-    DP("Start initializing CUDA\n");
-
-    CUresult Err = cuInit(0);
-    if (Err == CUDA_ERROR_INVALID_HANDLE) {
-      // Can't call cuGetErrorString if dlsym failed
-      DP("Failed to load CUDA shared library\n");
-      return;
-    }
-    if (Err == CUDA_ERROR_NO_DEVICE) {
-      DP("There are no devices supporting CUDA.\n");
-      return;
-    }
-    if (!checkResult(Err, "Error returned from cuInit\n")) {
-      return;
-    }
-
-    Err = cuDeviceGetCount(&NumberOfDevices);
-    if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
-      return;
-
-    if (NumberOfDevices == 0) {
-      DP("There are no devices supporting CUDA.\n");
-      return;
-    }
-
-    DeviceData.resize(NumberOfDevices);
-    Modules.resize(NumberOfDevices);
-    StreamPool.resize(NumberOfDevices);
-    EventPool.resize(NumberOfDevices);
-    PeerAccessMatrix.resize(NumberOfDevices);
-    for (auto &V : PeerAccessMatrix)
-      V.resize(NumberOfDevices, PeerAccessState::Unkown);
-
-    // Get environment variables regarding teams
-    if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
-      // OMP_TEAM_LIMIT has been set
-      EnvTeamLimit = std::stoi(EnvStr);
-      DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
-    }
-    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
-      // OMP_TEAMS_THREAD_LIMIT has been set
-      EnvTeamThreadLimit = std::stoi(EnvStr);
-      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
-    }
-    if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
-      // OMP_NUM_TEAMS has been set
-      EnvNumTeams = std::stoi(EnvStr);
-      DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
-    }
-    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
-      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
-      DynamicMemorySize = std::stoi(EnvStr);
-      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
-         DynamicMemorySize);
-    }
-    if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
-      // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
-      NumInitialStreams = std::stoi(EnvStr);
-      DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
-    }
-
-    for (int I = 0; I < NumberOfDevices; ++I)
-      DeviceAllocators.emplace_back();
-
-    // Get the size threshold from environment variable
-    std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
-    UseMemoryManager = Res.second;
-    size_t MemoryManagerThreshold = Res.first;
-
-    if (UseMemoryManager)
-      for (int I = 0; I < NumberOfDevices; ++I)
-        MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
-            DeviceAllocators[I], MemoryManagerThreshold));
-
-    // We lazily initialize all devices later.
-    InitializedFlags.assign(NumberOfDevices, false);
-  }
-
-  ~DeviceRTLTy() {
-    for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
-      deinitDevice(DeviceId);
-  }
-
-  // Check whether a given DeviceId is valid
-  bool isValidDeviceId(const int DeviceId) const {
-    return DeviceId >= 0 && DeviceId < NumberOfDevices;
-  }
-
-  int getNumOfDevices() const { return NumberOfDevices; }
-
-  void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
-
-  int initDevice(const int DeviceId) {
-    CUdevice Device;
-
-    DP("Getting device %d\n", DeviceId);
-    CUresult Err = cuDeviceGet(&Device, DeviceId);
-    if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
-      return OFFLOAD_FAIL;
-
-    assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
-    InitializedFlags[DeviceId] = true;
-
-    // Query the current flags of the primary context and set its flags if
-    // it is inactive
-    unsigned int FormerPrimaryCtxFlags = 0;
-    int FormerPrimaryCtxIsActive = 0;
-    Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
-                                     &FormerPrimaryCtxIsActive);
-    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
-      return OFFLOAD_FAIL;
-
-    if (FormerPrimaryCtxIsActive) {
-      DP("The primary context is active, no change to its flags\n");
-      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
-          CU_CTX_SCHED_BLOCKING_SYNC)
-        DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
-    } else {
-      DP("The primary context is inactive, set its flags to "
-         "CU_CTX_SCHED_BLOCKING_SYNC\n");
-      Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
-      if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
-        return OFFLOAD_FAIL;
-    }
-
-    // Retain the per device primary context and save it to use whenever this
-    // device is selected.
-    Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
-    if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
-      return OFFLOAD_FAIL;
-
-    Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
-    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
-      return OFFLOAD_FAIL;
-
-    // Initialize the stream pool.
-    if (!StreamPool[DeviceId])
-      StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
-                                                            NumInitialStreams);
-
-    // Initialize the event pool.
-    if (!EventPool[DeviceId])
-      EventPool[DeviceId] =
-          std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
-
-    // Query attributes to determine number of threads/block and blocks/grid.
-    int MaxGridDimX;
-    Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
-                               Device);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error getting max grid dimension, use default value %d\n",
-         DeviceRTLTy::DefaultNumTeams);
-      DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
-    } else {
-      DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
-      DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
-    }
-
-    // We are only exploiting threads along the x axis.
-    int MaxBlockDimX;
-    Err = cuDeviceGetAttribute(&MaxBlockDimX,
-                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error getting max block dimension, use default value %d\n",
-         DeviceRTLTy::DefaultNumThreads);
-      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
-    } else {
-      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
-      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
-
-      if (EnvTeamThreadLimit > 0 &&
-          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
-        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
-           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
-           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
-        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
-      }
-      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
-        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
-           "capping at the hard limit\n",
-           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
-        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
-      }
-    }
-
-    // Get and set warp size
-    int WarpSize;
-    Err =
-        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error getting warp size, assume default value 32\n");
-      DeviceData[DeviceId].WarpSize = 32;
-    } else {
-      DP("Using warp size %d\n", WarpSize);
-      DeviceData[DeviceId].WarpSize = WarpSize;
-    }
-
-    // Adjust teams to the env variables
-    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
-      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
-         EnvTeamLimit);
-      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
-    }
-
-    size_t StackLimit;
-    size_t HeapLimit;
-    if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
-      StackLimit = std::stol(EnvStr);
-      if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
-        return OFFLOAD_FAIL;
-    } else {
-      if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
-        return OFFLOAD_FAIL;
-    }
-    if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
-      HeapLimit = std::stol(EnvStr);
-      if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
-        return OFFLOAD_FAIL;
-    } else {
-      if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
-        return OFFLOAD_FAIL;
-    }
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Device supports up to %d CUDA blocks and %d threads with a "
-         "warp size of %d\n",
-         DeviceData[DeviceId].BlocksPerGrid,
-         DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Device heap size is %d Bytes, device stack size is %d Bytes per "
-         "thread\n",
-         (int)HeapLimit, (int)StackLimit);
-
-    // Set default number of teams
-    if (EnvNumTeams > 0) {
-      DP("Default number of teams set according to environment %d\n",
-         EnvNumTeams);
-      DeviceData[DeviceId].NumTeams = EnvNumTeams;
-    } else {
-      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
-      DP("Default number of teams set according to library's default %d\n",
-         DeviceRTLTy::DefaultNumTeams);
-    }
-
-    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Default number of teams exceeds device limit, capping at %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
-    }
-
-    // Set default number of threads
-    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
-    DP("Default number of threads set according to library's default %d\n",
-       DeviceRTLTy::DefaultNumThreads);
-    if (DeviceData[DeviceId].NumThreads >
-        DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Default number of threads exceeds device limit, capping at %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int deinitDevice(const int DeviceId) {
-    auto IsInitialized = InitializedFlags[DeviceId];
-    if (!IsInitialized)
-      return OFFLOAD_SUCCESS;
-    InitializedFlags[DeviceId] = false;
-
-    if (UseMemoryManager)
-      MemoryManagers[DeviceId].release();
-
-    StreamPool[DeviceId].reset();
-    EventPool[DeviceId].reset();
-
-    DeviceDataTy &D = DeviceData[DeviceId];
-    if (!checkResult(cuCtxSetCurrent(D.Context),
-                     "Error returned from cuCtxSetCurrent\n"))
-      return OFFLOAD_FAIL;
-
-    // Unload all modules.
-    for (auto &M : Modules[DeviceId])
-      if (!checkResult(cuModuleUnload(M),
-                       "Error returned from cuModuleUnload\n"))
-        return OFFLOAD_FAIL;
-
-    // Destroy context.
-    CUdevice Device;
-    if (!checkResult(cuCtxGetDevice(&Device),
-                     "Error returned from cuCtxGetDevice\n"))
-      return OFFLOAD_FAIL;
-
-    if (!checkResult(cuDevicePrimaryCtxRelease(Device),
-                     "Error returned from cuDevicePrimaryCtxRelease\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  __tgt_target_table *loadBinary(const int DeviceId,
-                                 const __tgt_device_image *Image) {
-    // Clear the offload table as we are going to create a new one.
-    clearOffloadEntriesTable(DeviceId);
-
-    // Create the module and extract the function pointers.
-    CUmodule Module;
-    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
-    CUresult Err =
-        cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
-    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
-      return nullptr;
-
-    DP("CUDA module successfully loaded!\n");
-
-    Modules[DeviceId].push_back(Module);
-
-    // Find the symbols in the module by name.
-    const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
-    const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
-    std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
-    for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
-      if (!E->addr) {
-        // We return nullptr when something like this happens, the host should
-        // have always something in the address to uniquely identify the target
-        // region.
-        DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
-        return nullptr;
-      }
-
-      if (E->size) {
-        __tgt_offload_entry Entry = *E;
-        CUdeviceptr CUPtr;
-        size_t CUSize;
-        Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
-        // We keep this style here because we need the name
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Loading global '%s' Failed\n", E->name);
-          CUDA_ERR_STRING(Err);
-          return nullptr;
-        }
-
-        if (CUSize != E->size) {
-          DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
-             CUSize, E->size);
-          return nullptr;
-        }
-
-        DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-           DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
-
-        Entry.addr = (void *)(CUPtr);
-
-        // Note: In the current implementation declare target variables
-        // can either be link or to. This means that once unified
-        // memory is activated via the requires directive, the variable
-        // can be used directly from the host in both cases.
-        // TODO: when variables types other than to or link are added,
-        // the below condition should be changed to explicitly
-        // check for to and link variables types:
-        // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
-        // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
-        if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
-          // If unified memory is present any target link or to variables
-          // can access host addresses directly. There is no longer a
-          // need for device copies.
-          cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
-          DP("Copy linked variable host address (" DPxMOD
-             ") to device address (" DPxMOD ")\n",
-             DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
-        }
-
-        addOffloadEntry(DeviceId, Entry);
-
-        continue;
-      }
-
-      CUfunction Func;
-      Err = cuModuleGetFunction(&Func, Module, E->name);
-      // We keep this style here because we need the name
-      if (Err != CUDA_SUCCESS) {
-        REPORT("Loading '%s' Failed\n", E->name);
-        CUDA_ERR_STRING(Err);
-        return nullptr;
-      }
-
-      DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
-         DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
-
-      // default value GENERIC (in case symbol is missing from cubin file)
-      llvm::omp::OMPTgtExecModeFlags ExecModeVal;
-      std::string ExecModeNameStr(E->name);
-      ExecModeNameStr += "_exec_mode";
-      const char *ExecModeName = ExecModeNameStr.c_str();
-
-      CUdeviceptr ExecModePtr;
-      size_t CUSize;
-      Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
-      if (Err == CUDA_SUCCESS) {
-        if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
-          DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
-             ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
-          return nullptr;
-        }
-
-        Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error when copying data from device to host. Pointers: "
-                 "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
-                 DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
-          CUDA_ERR_STRING(Err);
-          return nullptr;
-        }
-      } else {
-        DP("Loading global exec_mode '%s' - symbol missing, using default "
-           "value GENERIC (1)\n",
-           ExecModeName);
-      }
-
-      KernelsList.emplace_back(Func, ExecModeVal);
-
-      __tgt_offload_entry Entry = *E;
-      Entry.addr = &KernelsList.back();
-      addOffloadEntry(DeviceId, Entry);
-    }
-
-    // send device environment data to the device
-    {
-      // TODO: The device ID used here is not the real device ID used by OpenMP.
-      DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
-                                    static_cast<uint32_t>(DeviceId),
-                                    static_cast<uint32_t>(DynamicMemorySize)};
-
-      if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
-        DeviceEnv.DebugKind = std::stoi(EnvStr);
-
-      const char *DeviceEnvName = "__omp_rtl_device_environment";
-      CUdeviceptr DeviceEnvPtr;
-      size_t CUSize;
-
-      Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
-      if (Err == CUDA_SUCCESS) {
-        if (CUSize != sizeof(DeviceEnv)) {
-          REPORT(
-              "Global device_environment '%s' - size mismatch (%zu != %zu)\n",
-              DeviceEnvName, CUSize, sizeof(int32_t));
-          CUDA_ERR_STRING(Err);
-          return nullptr;
-        }
-
-        Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error when copying data from host to device. Pointers: "
-                 "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
-                 DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
-          CUDA_ERR_STRING(Err);
-          return nullptr;
-        }
-
-        DP("Sending global device environment data %zu bytes\n", CUSize);
-      } else {
-        DP("Finding global device environment '%s' - symbol missing.\n",
-           DeviceEnvName);
-        DP("Continue, considering this is a device RTL which does not accept "
-           "environment setting.\n");
-      }
-    }
-
-    return getOffloadEntriesTable(DeviceId);
-  }
-
-  void *dataAlloc(const int DeviceId, const int64_t Size,
-                  const TargetAllocTy Kind) {
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-      if (UseMemoryManager)
-        return MemoryManagers[DeviceId]->allocate(Size, nullptr);
-      else
-        return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
-    case TARGET_ALLOC_HOST:
-    case TARGET_ALLOC_SHARED:
-      return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
-    }
-
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-
-    return nullptr;
-  }
-
-  int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
-                 const int64_t Size, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when copying data from host to device. Pointers: host "
-         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
-         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
-      CUDA_ERR_STRING(Err);
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
-                   const int64_t Size, __tgt_async_info *AsyncInfo) const {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when copying data from device to host. Pointers: host "
-         "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
-         DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
-      CUDA_ERR_STRING(Err);
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
-                   int64_t Size, __tgt_async_info *AsyncInfo) {
-    assert(AsyncInfo && "AsyncInfo is nullptr");
-
-    CUresult Err;
-    CUstream Stream = getStream(SrcDevId, AsyncInfo);
-
-    // If they are two devices, we try peer to peer copy first
-    if (SrcDevId != DstDevId) {
-      std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
-
-      switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
-      case PeerAccessState::No: {
-        REPORT("Peer access from %" PRId32 " to %" PRId32
-               " is not supported. Fall back to D2D memcpy.\n",
-               SrcDevId, DstDevId);
-        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-      }
-      case PeerAccessState::Unkown: {
-        int CanAccessPeer = 0;
-        Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
-                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
-                 SrcDevId, DstDevId);
-          CUDA_ERR_STRING(Err);
-          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
-          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-        }
-
-        if (!CanAccessPeer) {
-          REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
-                 "memcpy.\n",
-                 SrcDevId, DstDevId);
-          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
-          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-        }
-
-        Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
-        if (Err != CUDA_SUCCESS) {
-          REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
-                 ", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
-                 SrcDevId, DstDevId);
-          CUDA_ERR_STRING(Err);
-          PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
-          return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-        }
-
-        PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
-
-        [[fallthrough]];
-      }
-      case PeerAccessState::Yes: {
-        Err = cuMemcpyPeerAsync(
-            (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
-            (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
-        if (Err == CUDA_SUCCESS)
-          return OFFLOAD_SUCCESS;
-
-        DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
-           ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
-           ". Fall back to D2D memcpy.\n",
-           DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
-        CUDA_ERR_STRING(Err);
-
-        return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-      }
-      }
-    }
-
-    return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
-  }
-
-  int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) {
-    switch (Kind) {
-    case TARGET_ALLOC_DEFAULT:
-    case TARGET_ALLOC_DEVICE:
-      if (UseMemoryManager)
-        return MemoryManagers[DeviceId]->free(TgtPtr);
-      else
-        return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
-    case TARGET_ALLOC_HOST:
-    case TARGET_ALLOC_SHARED:
-      return DeviceAllocators[DeviceId].free(TgtPtr, Kind);
-    }
-
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-
-    return OFFLOAD_FAIL;
-  }
-
-  int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
-                          ptrdiff_t *TgtOffsets, const int ArgNum,
-                          const int TeamNum, const int ThreadLimit,
-                          const unsigned int LoopTripCount,
-                          __tgt_async_info *AsyncInfo) const {
-    // All args are references.
-    std::vector<void *> Args(ArgNum);
-    std::vector<void *> Ptrs(ArgNum);
-
-    for (int I = 0; I < ArgNum; ++I) {
-      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-      Args[I] = &Ptrs[I];
-    }
-
-    KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
-
-    const bool IsSPMDGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
-    const bool IsSPMDMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
-    const bool IsGenericMode =
-        KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
-
-    int CudaThreadsPerBlock;
-    if (ThreadLimit > 0) {
-      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
-      CudaThreadsPerBlock = ThreadLimit;
-      // Add master warp if necessary
-      if (IsGenericMode) {
-        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
-        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
-      }
-    } else {
-      DP("Setting CUDA threads per block to default %d\n",
-         DeviceData[DeviceId].NumThreads);
-      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
-    }
-
-    if ((unsigned)CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
-      DP("Threads per block capped at device limit %d\n",
-         DeviceData[DeviceId].ThreadsPerBlock);
-      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
-    }
-
-    CUresult Err;
-    if (!KernelInfo->MaxThreadsPerBlock) {
-      Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
-                               CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
-                               KernelInfo->Func);
-      if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
-        return OFFLOAD_FAIL;
-    }
-
-    if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
-      DP("Threads per block capped at kernel limit %d\n",
-         KernelInfo->MaxThreadsPerBlock);
-      CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
-    }
-
-    unsigned int CudaBlocksPerGrid;
-    if (TeamNum <= 0) {
-      if (LoopTripCount > 0 && EnvNumTeams < 0) {
-        if (IsSPMDGenericMode) {
-          // If we reach this point, then we are executing a kernel that was
-          // transformed from Generic-mode to SPMD-mode. This kernel has
-          // SPMD-mode execution, but needs its blocks to be scheduled
-          // differently because the current loop trip count only applies to the
-          // `teams distribute` region and will create var too few blocks using
-          // the regular SPMD-mode method.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else if (IsSPMDMode) {
-          // We have a combined construct, i.e. `target teams distribute
-          // parallel for [simd]`. We launch so many teams so that each thread
-          // will execute one iteration of the loop. round up to the nearest
-          // integer
-          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
-        } else if (IsGenericMode) {
-          // If we reach this point, then we have a non-combined construct, i.e.
-          // `teams distribute` with a nested `parallel for` and each team is
-          // assigned one iteration of the `distribute` loop. E.g.:
-          //
-          // #pragma omp target teams distribute
-          // for(...loop_tripcount...) {
-          //   #pragma omp parallel for
-          //   for(...) {}
-          // }
-          //
-          // Threads within a team will execute the iterations of the `parallel`
-          // loop.
-          CudaBlocksPerGrid = LoopTripCount;
-        } else {
-          REPORT("Unknown execution mode: %d\n",
-                 static_cast<int8_t>(KernelInfo->ExecutionMode));
-          return OFFLOAD_FAIL;
-        }
-        DP("Using %d teams due to loop trip count %" PRIu32
-           " and number of threads per block %d\n",
-           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
-      } else {
-        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
-        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
-      }
-    } else {
-      DP("Using requested number of teams %d\n", TeamNum);
-      CudaBlocksPerGrid = TeamNum;
-    }
-
-    if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
-      DP("Capping number of teams to team limit %d\n",
-         DeviceData[DeviceId].BlocksPerGrid);
-      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
-    }
-
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s mode\n",
-         (getOffloadEntry(DeviceId, TgtEntryPtr))
-             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
-             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
-         (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
-
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         DynamicMemorySize, Stream, &Args[0], nullptr);
-    if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
-      return OFFLOAD_FAIL;
-
-    DP("Launch of entry point at " DPxMOD " successful!\n",
-       DPxPTR(TgtEntryPtr));
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-    CUresult Err = cuStreamSynchronize(Stream);
-
-    // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
-    StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
-    AsyncInfo->Queue = nullptr;
-
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when synchronizing stream. stream = " DPxMOD
-         ", async info ptr = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(AsyncInfo));
-      CUDA_ERR_STRING(Err);
-    }
-    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
-  }
-
-  int queryAsync(const int DeviceId, __tgt_async_info *AsyncInfo) const {
-    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-    CUresult Err = cuStreamQuery(Stream);
-
-    // Not ready streams must be considered as successful operations.
-    if (Err == CUDA_ERROR_NOT_READY)
-      return OFFLOAD_SUCCESS;
-
-    // Once the stream is synchronized or an error occurs, return it to the
-    // stream pool and reset AsyncInfo. This is to make sure the
-    // synchronization only works for its own tasks.
-    StreamPool[DeviceId]->release(Stream);
-    AsyncInfo->Queue = nullptr;
-
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when querying for stream progress. stream = " DPxMOD
-         ", async info ptr = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(AsyncInfo));
-      CUDA_ERR_STRING(Err);
-    }
-    return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
-  }
-
-  void printDeviceInfo(int32_t DeviceId) {
-    char TmpChar[1000];
-    std::string TmpStr;
-    size_t TmpSt;
-    int TmpInt, TmpInt2, TmpInt3;
-
-    CUdevice Device;
-    checkResult(cuDeviceGet(&Device, DeviceId),
-                "Error returned from cuCtxGetDevice\n");
-
-    cuDriverGetVersion(&TmpInt);
-    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
-    printf("    CUDA Device Number: \t\t%d \n", DeviceId);
-    checkResult(cuDeviceGetName(TmpChar, 1000, Device),
-                "Error returned from cuDeviceGetName\n");
-    printf("    Device Name: \t\t\t%s \n", TmpChar);
-    checkResult(cuDeviceTotalMem(&TmpSt, Device),
-                "Error returned from cuDeviceTotalMem\n");
-    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Registers per Block: \t\t%d \n", TmpInt);
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
-           TmpInt3);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
-           TmpInt3);
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
-      TmpStr = "DEFAULT";
-    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
-      TmpStr = "PROHIBITED";
-    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
-      TmpStr = "EXCLUSIVE PROCESS";
-    else
-      TmpStr = "unknown";
-    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
-    checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
-                                     Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
-                    Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(cuDeviceGetAttribute(
-                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
-                "Error returned from cuDeviceGetAttribute\n");
-    printf("    Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    checkResult(
-        cuDeviceGetAttribute(
-            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
-        "Error returned from cuDeviceGetAttribute\n");
-    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
-  }
-
-  int createEvent(int DeviceId, void **P) {
-    CUevent Event = nullptr;
-    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
-      return OFFLOAD_FAIL;
-    *P = Event;
-    return OFFLOAD_SUCCESS;
-  }
-
-  int destroyEvent(int DeviceId, void *EventPtr) {
-    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
-    return OFFLOAD_SUCCESS;
-  }
-
-  int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
-                void *EventPtr) const {
-    CUstream Stream = getStream(DeviceId, AsyncInfo);
-    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-    // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
-    // specific CUDA version, and defined as 0x0. In previous version, per CUDA
-    // API document, that argument has to be 0x0.
-    CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
-    if (Err != CUDA_SUCCESS) {
-      DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
-         DPxPTR(Stream), DPxPTR(Event));
-      CUDA_ERR_STRING(Err);
-      return OFFLOAD_FAIL;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
-    if (AsyncInfo->Queue) {
-      StreamPool[DeviceId]->release(
-          reinterpret_cast<CUstream>(AsyncInfo->Queue));
-      AsyncInfo->Queue = nullptr;
-    }
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
-    *AsyncInfo = new __tgt_async_info;
-    getStream(DeviceId, *AsyncInfo);
-    return OFFLOAD_SUCCESS;
-  }
-
-  int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
-                     const char **ErrStr) const {
-    assert(DeviceInfo && "DeviceInfo is nullptr");
-
-    if (!DeviceInfo->Context)
-      DeviceInfo->Context = DeviceData[DeviceId].Context;
-    if (!DeviceInfo->Device) {
-      CUdevice Dev;
-      CUresult Err = cuDeviceGet(&Dev, DeviceId);
-      if (Err == CUDA_SUCCESS) {
-        DeviceInfo->Device = reinterpret_cast<void *>(Dev);
-      } else {
-        cuGetErrorString(Err, ErrStr);
-        return OFFLOAD_FAIL;
-      }
-    }
-    return OFFLOAD_SUCCESS;
-  }
-
-  int setContext(int DeviceId) {
-    assert(InitializedFlags[DeviceId] && "Device is not initialized");
-
-    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
-    if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-DeviceRTLTy DeviceRTL;
-} // namespace
-
-// Exposed library API function
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-  return elf_check_machine(Image, /* EM_CUDA */ 190);
-}
-
-int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image,
-                                       __tgt_image_info *Info) {
-  if (!__tgt_rtl_is_valid_binary(Image))
-    return false;
-
-  // A subarchitecture was not specified. Assume it is compatible.
-  if (!Info || !Info->Arch)
-    return true;
-
-  int32_t NumberOfDevices = 0;
-  if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
-    return false;
-
-  StringRef ArchStr = StringRef(Info->Arch).drop_front(sizeof("sm_") - 1);
-  for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
-    CUdevice Device;
-    if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
-      return false;
-
-    int32_t Major, Minor;
-    if (cuDeviceGetAttribute(&Major,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                             Device) != CUDA_SUCCESS)
-      return false;
-    if (cuDeviceGetAttribute(&Minor,
-                             CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                             Device) != CUDA_SUCCESS)
-      return false;
-
-    // A cubin generated for a certain compute capability is supported to run on
-    // any GPU with the same major revision and same or higher minor revision.
-    int32_t ImageMajor = ArchStr[0] - '0';
-    int32_t ImageMinor = ArchStr[1] - '0';
-    if (Major != ImageMajor || Minor < ImageMinor)
-      return false;
-  }
-
-  DP("Image has compatible compute capability: %s\n", Info->Arch);
-  return true;
-}
-
-int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
-  DeviceRTL.setRequiresFlag(RequiresFlags);
-  return RequiresFlags;
-}
-
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
-  if (DeviceRTL.isValidDeviceId(SrcDevId) &&
-      DeviceRTL.isValidDeviceId(DstDevId))
-    return 1;
-
-  return 0;
-}
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set when init the device.
-
-  return DeviceRTL.initDevice(DeviceId);
-}
-
-int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set when deinit the device.
-
-  return DeviceRTL.deinitDevice(DeviceId);
-}
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return nullptr;
-
-  return DeviceRTL.loadBinary(DeviceId, Image);
-}
-
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
-                           int32_t Kind) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return nullptr;
-
-  return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_data_submit_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc =
-      __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
-                                    void *HstPtr, int64_t Size,
-                                    __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_data_retrieve_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc =
-      __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
-                                      void *TgtPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
-                                      int DstDevId, void *DstPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
-  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-
-  if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
-                                AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
-                                int32_t DstDevId, void *DstPtr, int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
-  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
-  // Context is set in __tgt_rtl_data_exchange_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
-                                                   DstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind);
-}
-
-int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
-                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                         int32_t ArgNum, int32_t TeamNum,
-                                         int32_t ThreadLimit,
-                                         uint64_t LoopTripcount) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_team_region_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_run_target_team_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
-      LoopTripcount, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_team_region_async(
-    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
-    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
-    uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
-                                       TgtOffsets, ArgNum, TeamNum, ThreadLimit,
-                                       LoopTripcount, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
-                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                    int32_t ArgNum) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_region_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_run_target_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
-                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                          int32_t ArgNum,
-                                          __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_team_region_async.
-  return __tgt_rtl_run_target_team_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
-      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
-      AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  // NOTE: We don't need to set context for stream sync.
-  return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_query_async(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  // NOTE: We don't need to set context for stream query.
-  return DeviceRTL.queryAsync(DeviceId, AsyncInfoPtr);
-}
-
-void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
-  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
-  InfoLevel.store(NewInfoLevel);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // NOTE: We don't need to set context for print device info.
-  DeviceRTL.printDeviceInfo(DeviceId);
-}
-
-int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
-  assert(Event && "event is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.createEvent(DeviceId, Event);
-}
-
-int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
-                               __tgt_async_info *AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  assert(EventPtr && "event_ptr is nullptr");
-  // NOTE: We might not need to set context for event record.
-  return recordEvent(EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
-                             __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(EventPtr && "event is nullptr");
-  // If we don't have a queue we need to set the context.
-  if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-  return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
-}
-
-int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
-  assert(EventPtr && "event is nullptr");
-  // NOTE: We might not need to set context for event sync.
-  return syncEvent(EventPtr);
-}
-
-int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
-  assert(EventPtr && "event is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.destroyEvent(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
-                                     __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfo && "async_info is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
-                                  __tgt_async_info **AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfo && "async_info is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
-                                   __tgt_device_info *DeviceInfoPtr,
-                                   const char **ErrStr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(DeviceInfoPtr && "device_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
deleted file mode 100644
--- a/openmp/libomptarget/plugins/exports
+++ /dev/null
@@ -1,6 +0,0 @@
-VERS1.0 {
-  global:
-    __tgt_rtl*;
-  local:
-    *;
-};
diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for generic 64-bit machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-#include <cassert>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ffi.h>
-#include <link.h>
-#include <list>
-#include <string>
-#include <vector>
-
-#include "Debug.h"
-#include "omptargetplugin.h"
-
-using namespace llvm;
-using namespace llvm::sys;
-
-#ifndef TARGET_NAME
-#define TARGET_NAME Generic ELF - 64bit
-#endif
-#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
-
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID 0
-#endif
-
-#include "elf_common.h"
-
-#define NUMBER_OF_DEVICES 4
-#define OFFLOAD_SECTION_NAME "omp_offloading_entries"
-
-/// Array of Dynamic libraries loaded for this target.
-struct DynLibTy {
-  std::string FileName;
-  std::unique_ptr<DynamicLibrary> DynLib;
-};
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  SmallVector<__tgt_offload_entry> Entries;
-};
-
-/// Class containing all the device information.
-class RTLDeviceInfoTy {
-  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-
-public:
-  std::list<DynLibTy> DynLibs;
-
-  // Record entry point associated with device.
-  void createOffloadTable(int32_t DeviceId,
-                          SmallVector<__tgt_offload_entry> &&Entries) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncGblEntries[DeviceId].emplace_back();
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    E.Entries = Entries;
-    E.Table.EntriesBegin = E.Entries.begin();
-    E.Table.EntriesEnd = E.Entries.end();
-  }
-
-  // Return true if the entry is associated with device.
-  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    for (__tgt_offload_entry *I = E.Table.EntriesBegin,
-                             *End = E.Table.EntriesEnd;
-         I < End; ++I) {
-      if (I->addr == Addr)
-        return true;
-    }
-
-    return false;
-  }
-
-  // Return the pointer to the target entries table.
-  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    return &E.Table;
-  }
-
-  RTLDeviceInfoTy(int32_t NumDevices) { FuncGblEntries.resize(NumDevices); }
-
-  ~RTLDeviceInfoTy() {
-    // Close dynamic libraries
-    for (auto &Lib : DynLibs) {
-      if (Lib.DynLib->isValid())
-        remove(Lib.FileName.c_str());
-    }
-  }
-};
-
-static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-// If we don't have a valid ELF ID we can just fail.
-#if TARGET_ELF_ID < 1
-  return 0;
-#else
-  return elf_check_machine(Image, TARGET_ELF_ID);
-#endif
-}
-
-int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) { return OFFLOAD_SUCCESS; }
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-
-  DP("Dev %d: load binary from " DPxMOD " image\n", DeviceId,
-     DPxPTR(Image->ImageStart));
-
-  assert(DeviceId >= 0 && DeviceId < NUMBER_OF_DEVICES && "bad dev id");
-
-  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
-
-  // load dynamic library and get the entry points. We use the dl library
-  // to do the loading of the library, but we could do it directly to avoid the
-  // dump to the temporary file.
-  //
-  // 1) Create tmp file with the library contents.
-  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
-  char TmpName[] = "/tmp/tmpfile_XXXXXX";
-  int TmpFd = mkstemp(TmpName);
-
-  if (TmpFd == -1)
-    return nullptr;
-
-  FILE *Ftmp = fdopen(TmpFd, "wb");
-
-  if (!Ftmp)
-    return nullptr;
-
-  fwrite(Image->ImageStart, ImageSize, 1, Ftmp);
-  fclose(Ftmp);
-
-  std::string ErrMsg;
-  auto DynLib = std::make_unique<sys::DynamicLibrary>(
-      sys::DynamicLibrary::getPermanentLibrary(TmpName, &ErrMsg));
-  DynLibTy Lib = {TmpName, std::move(DynLib)};
-
-  if (!Lib.DynLib->isValid()) {
-    DP("Target library loading error: %s\n", ErrMsg.c_str());
-    return NULL;
-  }
-
-  __tgt_offload_entry *HostBegin = Image->EntriesBegin;
-  __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
-  // Create a new offloading entry list using the device symbol address.
-  SmallVector<__tgt_offload_entry> Entries;
-  for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
-    if (!E->addr)
-      return nullptr;
-
-    __tgt_offload_entry Entry = *E;
-
-    void *DevAddr = Lib.DynLib->getAddressOfSymbol(E->name);
-    Entry.addr = DevAddr;
-
-    DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-       DPxPTR(E - HostBegin), E->name, DPxPTR(DevAddr));
-
-    Entries.emplace_back(Entry);
-  }
-
-  DeviceInfo.createOffloadTable(DeviceId, std::move(Entries));
-  DeviceInfo.DynLibs.emplace_back(std::move(Lib));
-
-  return DeviceInfo.getOffloadEntriesTable(DeviceId);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  printf("    This is a generic-elf-64bit device\n");
-}
-
-// Sample implementation of explicit memory allocator. For this plugin all kinds
-// are equivalent to each other.
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
-                           int32_t Kind) {
-  void *Ptr = NULL;
-
-  switch (Kind) {
-  case TARGET_ALLOC_DEVICE:
-  case TARGET_ALLOC_HOST:
-  case TARGET_ALLOC_SHARED:
-  case TARGET_ALLOC_DEFAULT:
-    Ptr = malloc(Size);
-    break;
-  default:
-    REPORT("Invalid target data allocation kind");
-  }
-
-  return Ptr;
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  memcpy(TgtPtr, HstPtr, Size);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  memcpy(HstPtr, TgtPtr, Size);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
-  free(TgtPtr);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "Only one dimensional kernels supported.");
-  // ignore team num and thread limit.
-
-  // Use libffi to launch execution.
-  ffi_cif Cif;
-
-  // All args are references.
-  std::vector<ffi_type *> ArgsTypes(KernelArgs->NumArgs, &ffi_type_pointer);
-  std::vector<void *> Args(KernelArgs->NumArgs);
-  std::vector<void *> Ptrs(KernelArgs->NumArgs);
-
-  for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
-    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-    Args[I] = &Ptrs[I];
-  }
-
-  ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, KernelArgs->NumArgs,
-                                   &ffi_type_void, &ArgsTypes[0]);
-
-  assert(Status == FFI_OK && "Unable to prepare target launch!");
-
-  if (Status != FFI_OK)
-    return OFFLOAD_FAIL;
-
-  DP("Running entry point at " DPxMOD "...\n", DPxPTR(TgtEntryPtr));
-
-  void (*Entry)(void);
-  *((void **)&Entry) = TgtEntryPtr;
-  ffi_call(&Cif, Entry, NULL, &Args[0]);
-  return OFFLOAD_SUCCESS;
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-# 
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# 
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
-else()
- libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-# 
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# 
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64le machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
-else()
- libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/remote/CMakeLists.txt b/openmp/libomptarget/plugins/remote/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/CMakeLists.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-##===----------------------------------------------------------------------===##
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin (client) and server for remote offloading.
-#
-##===----------------------------------------------------------------------===#
-
-if (NOT(CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  libomptarget_say("Not building remote offloading plugin: only support Linux hosts.")
-  return()
-endif()
-
-if (NOT(LIBOMPTARGET_ENABLE_EXPERIMENTAL_REMOTE_PLUGIN))
-  return()
-endif()
-
-find_package(Protobuf)
-find_package(gRPC CONFIG)
-
-find_program(PROTOC protoc)
-find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin)
-
-if (Protobuf_FOUND AND gRPC_FOUND AND PROTOC AND GRPC_CPP_PLUGIN)
-  libomptarget_say("Building remote offloading plugin.")
-  set(directory "${CMAKE_BINARY_DIR}/include/openmp/libomptarget/plugins/remote/")
-  file(MAKE_DIRECTORY ${directory})
-  execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${directory})
-  execute_process(
-          COMMAND protoc --cpp_out=${directory} -I ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/openmp.proto
-          COMMAND protoc --grpc_out=${directory} -I ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/openmp.proto --plugin=protoc-gen-grpc=${GRPC_CPP_PLUGIN}
-  )
-
-  set(GRPC_SRC_FILES
-      ${directory}/openmp.grpc.pb.cc
-      ${directory}/openmp.pb.cc
-  )
-
-  set(GRPC_INCLUDE_DIR
-      ${directory}
-  )
-
-  set(RPC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include/)
-  set(RPC_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/lib/)
-  
-  add_subdirectory(src)
-  add_subdirectory(server)
-else()
-  libomptarget_say("Not building remote offloading plugin: required libraries were not found.")
-endif()
-
diff --git a/openmp/libomptarget/plugins/remote/include/Utils.h b/openmp/libomptarget/plugins/remote/include/Utils.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/include/Utils.h
+++ /dev/null
@@ -1,133 +0,0 @@
-//===----------------- Utils.h - Utilities for Remote RTL -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utilities for data transfer through protobuf and debugging.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef UTILS_H
-#define UTILS_H
-
-#include "Debug.h"
-#include "omptarget.h"
-#include "openmp.grpc.pb.h"
-#include "openmp.pb.h"
-#include "rtl.h"
-#include <string>
-
-#define CLIENT_DBG(...)                                                        \
-  {                                                                            \
-    if (DebugLevel > 0) {                                                      \
-      fprintf(stderr, "[[Client]] --> ");                                      \
-      fprintf(stderr, __VA_ARGS__);                                            \
-      fprintf(stderr, "\n");                                                   \
-    }                                                                          \
-  }
-
-#define SERVER_DBG(...)                                                        \
-  {                                                                            \
-    if (DebugLevel > 0) {                                                      \
-      fprintf(stderr, "[[Server]] --> ");                                      \
-      fprintf(stderr, __VA_ARGS__);                                            \
-      fprintf(stderr, "\n");                                                   \
-    }                                                                          \
-  }
-
-namespace RemoteOffloading {
-
-using namespace openmp::libomptarget::remote;
-
-using openmp::libomptarget::remote::DeviceOffloadEntry;
-using openmp::libomptarget::remote::TargetBinaryDescription;
-using openmp::libomptarget::remote::TargetOffloadEntry;
-using openmp::libomptarget::remote::TargetTable;
-
-struct ClientManagerConfigTy {
-  std::vector<std::string> ServerAddresses;
-  uint64_t MaxSize;
-  uint64_t BlockSize;
-  int Timeout;
-
-  ClientManagerConfigTy()
-      : ServerAddresses({"0.0.0.0:50051"}), MaxSize(1 << 30),
-        BlockSize(1 << 20), Timeout(5) {
-    // TODO: Error handle for incorrect inputs
-    if (const char *Env = std::getenv("LIBOMPTARGET_RPC_ADDRESS")) {
-      ServerAddresses.clear();
-      std::string AddressString = Env;
-      const std::string Delimiter = ",";
-
-      size_t Pos;
-      std::string Token;
-      while ((Pos = AddressString.find(Delimiter)) != std::string::npos) {
-        Token = AddressString.substr(0, Pos);
-        ServerAddresses.push_back(Token);
-        AddressString.erase(0, Pos + Delimiter.length());
-      }
-      ServerAddresses.push_back(AddressString);
-    }
-    if (const char *Env = std::getenv("LIBOMPTARGET_RPC_ALLOCATOR_MAX"))
-      MaxSize = std::stoi(Env);
-    if (const char *Env = std::getenv("LIBOMPTARGET_RPC_BLOCK_SIZE"))
-      BlockSize = std::stoi(Env);
-    if (const char *Env1 = std::getenv("LIBOMPTARGET_RPC_LATENCY"))
-      Timeout = std::stoi(Env1);
-  }
-};
-
-/// Loads a target binary description into protobuf.
-void loadTargetBinaryDescription(const __tgt_bin_desc *Desc,
-                                 TargetBinaryDescription &Request);
-
-/// Unload a target binary description from protobuf. The map is used to keep
-/// track of already copied device images.
-void unloadTargetBinaryDescription(
-    const TargetBinaryDescription *Request, __tgt_bin_desc *Desc,
-    std::unordered_map<const void *, __tgt_device_image *>
-        &HostToRemoteDeviceImage);
-
-/// Frees argument as constructed by loadTargetBinaryDescription
-void freeTargetBinaryDescription(__tgt_bin_desc *Desc);
-
-/// Copies from TargetOffloadEntry protobuf to a tgt_bin_desc during unloading.
-void copyOffloadEntry(const TargetOffloadEntry &EntryResponse,
-                      __tgt_offload_entry *Entry);
-
-/// Copies from tgt_bin_desc into TargetOffloadEntry protobuf during loading.
-void copyOffloadEntry(const __tgt_offload_entry *Entry,
-                      TargetOffloadEntry *EntryResponse);
-
-/// Shallow copy of offload entry from tgt_bin_desc to TargetOffloadEntry
-/// during loading.
-void shallowCopyOffloadEntry(const __tgt_offload_entry *Entry,
-                             TargetOffloadEntry *EntryResponse);
-
-/// Copies DeviceOffloadEntries into table during unloading.
-void copyOffloadEntry(const DeviceOffloadEntry &EntryResponse,
-                      __tgt_offload_entry *Entry);
-
-/// Loads tgt_target_table into a TargetTable protobuf message.
-void loadTargetTable(__tgt_target_table *Table, TargetTable &TableResponse,
-                     __tgt_device_image *Image);
-
-/// Unloads from a target_table from protobuf.
-void unloadTargetTable(
-    TargetTable &TableResponse, __tgt_target_table *Table,
-    std::unordered_map<void *, void *> &HostToRemoteTargetTableMap);
-
-/// Frees argument as constructed by unloadTargetTable
-void freeTargetTable(__tgt_target_table *Table);
-
-void dump(const void *Start, const void *End);
-void dump(__tgt_offload_entry *Entry);
-void dump(TargetOffloadEntry Entry);
-void dump(__tgt_target_table *Table);
-void dump(__tgt_device_image *Image);
-} // namespace RemoteOffloading
-
-#endif
diff --git a/openmp/libomptarget/plugins/remote/include/openmp.proto b/openmp/libomptarget/plugins/remote/include/openmp.proto
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/include/openmp.proto
+++ /dev/null
@@ -1,153 +0,0 @@
-syntax = "proto3";
-
-package openmp.libomptarget.remote;
-option cc_enable_arenas = true;
-
-service RemoteOffload {
-  rpc Shutdown(Null) returns (I32) {}
-
-  rpc RegisterLib(TargetBinaryDescription) returns (I32) {}
-  rpc UnregisterLib(Pointer) returns (I32) {}
-
-  rpc IsValidBinary(TargetDeviceImagePtr) returns (I32) {}
-  rpc GetNumberOfDevices(Null) returns (I32) {}
-
-  rpc InitDevice(I32) returns (I32) {}
-  rpc InitRequires(I64) returns (I32) {}
-
-  rpc LoadBinary(Binary) returns (TargetTable) {}
-
-  rpc DataAlloc(AllocData) returns (Pointer) {}
-  rpc DataDelete(DeleteData) returns (I32) {}
-
-  rpc DataSubmit(stream SubmitData) returns (I32) {}
-  rpc DataRetrieve(RetrieveData) returns (stream Data) {}
-
-  rpc IsDataExchangeable(DevicePair) returns (I32) {}
-  rpc DataExchange(ExchangeData) returns (I32) {}
-
-  rpc RunTargetRegion(TargetRegion) returns (I32) {}
-  rpc RunTargetTeamRegion(TargetTeamRegion) returns (I32) {}
-}
-
-message Null {}
-
-message Pointer { uint64 number = 1; }
-
-message I32 { int32 number = 1; }
-
-message I64 { int64 number = 1; }
-
-message DevicePair {
-  int32 src_dev_id = 1;
-  int32 dst_dev_id = 2;
-}
-
-message Binary {
-  uint64 image_ptr = 1;
-  int32 device_id = 2;
-}
-
-message TargetOffloadEntry {
-  bytes data = 1;
-  string name = 2;
-  int32 flags = 3;
-  int32 reserved = 4;
-}
-
-message DeviceOffloadEntry {
-  string name = 1;
-  uint64 addr = 2;
-  int32 flags = 3;
-  int32 reserved = 4;
-  int32 size = 5;
-}
-
-message TargetTable {
-  repeated DeviceOffloadEntry entries = 1;
-  repeated uint64 entry_ptrs = 2;
-}
-
-message TargetDeviceImagePtr {
-  uint64 image_ptr = 1;
-  repeated uint64 entry_ptrs = 2;
-}
-
-message TargetDeviceImage {
-  bytes binary = 1;
-  repeated TargetOffloadEntry entries = 2;
-}
-
-message ImagePtrs {
-  uint64 img_ptr = 1;
-  repeated uint64 entry_ptrs = 2;
-}
-
-message TargetBinaryDescription {
-  repeated ImagePtrs image_ptrs = 1;
-  repeated TargetOffloadEntry entries = 2;
-  repeated TargetDeviceImage images = 3;
-  repeated uint64 entry_ptrs = 4;
-  uint64 bin_ptr = 5;
-}
-
-message AllocData {
-  uint64 size = 1;
-  uint64 hst_ptr = 2;
-  int32 device_id = 3;
-}
-
-message SubmitData {
-  bytes data = 1;
-  uint64 hst_ptr = 2;
-  uint64 tgt_ptr = 3;
-  uint64 start = 5;
-  uint64 size = 6;
-  int32 device_id = 7;
-}
-
-message RetrieveData {
-  uint64 hst_ptr = 1;
-  uint64 tgt_ptr = 2;
-  uint64 size = 3;
-  int32 device_id = 5;
-}
-
-message Data {
-  bytes data = 1;
-  uint64 start = 2;
-  uint64 size = 3;
-  int32 ret = 4;
-}
-
-message ExchangeData {
-  uint64 src_dev_id = 1;
-  uint64 src_ptr = 2;
-  uint64 dst_dev_id = 3;
-  uint64 dst_ptr = 4;
-  uint64 size = 6;
-}
-
-message DeleteData {
-  uint64 tgt_ptr = 1;
-  int32 device_id = 2;
-}
-
-message TargetRegion {
-  repeated uint64 tgt_args = 1;
-  repeated int64 tgt_offsets = 2;
-  uint64 tgt_entry_ptr = 3;
-  int32 device_id = 4;
-  int32 arg_num = 5;
-}
-
-message TargetTeamRegion {
-  repeated uint64 tgt_args = 1;
-  repeated int64 tgt_offsets = 2;
-  uint64 tgt_entry_ptr = 3;
-  uint64 loop_tripcount = 4;
-  int32 device_id = 5;
-  int32 arg_num = 6;
-  int32 team_num = 7;
-  int32 thread_limit = 8;
-}
diff --git a/openmp/libomptarget/plugins/remote/lib/Utils.cpp b/openmp/libomptarget/plugins/remote/lib/Utils.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/lib/Utils.cpp
+++ /dev/null
@@ -1,295 +0,0 @@
-//===---------------- Utils.cpp - Utilities for Remote RTL ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utilities for data movement and debugging.
-//
-//===----------------------------------------------------------------------===//
-
-#include "Utils.h"
-#include "omptarget.h"
-
-namespace RemoteOffloading {
-
-void loadTargetBinaryDescription(const __tgt_bin_desc *Desc,
-                                 TargetBinaryDescription &Request) {
-  // Keeps track of entries which have already been deep copied.
-  std::vector<void *> DeepCopiedEntryAddrs;
-
-  // Copy Global Offload Entries
-  for (auto *CurEntry = Desc->HostEntriesBegin;
-       CurEntry != Desc->HostEntriesEnd; CurEntry++) {
-    auto *NewEntry = Request.add_entries();
-    copyOffloadEntry(CurEntry, NewEntry);
-
-    // Copy the pointer of the offload entry of the image into the Request
-    Request.add_entry_ptrs((uint64_t)CurEntry);
-    DeepCopiedEntryAddrs.push_back(CurEntry);
-  }
-
-  // Copy Device Images and Device Offload Entries
-  __tgt_device_image *CurImage = Desc->DeviceImages;
-  for (auto I = 0; I < Desc->NumDeviceImages; I++, CurImage++) {
-    auto *Image = Request.add_images();
-    auto Size = (char *)CurImage->ImageEnd - (char *)CurImage->ImageStart;
-    Image->set_binary(CurImage->ImageStart, Size);
-
-    // Copy the pointer of the image into the Request
-    auto *NewImagePtr = Request.add_image_ptrs();
-    NewImagePtr->set_img_ptr((uint64_t)CurImage->ImageStart);
-
-    // Copy Device Offload Entries
-    for (auto *CurEntry = CurImage->EntriesBegin;
-         CurEntry != CurImage->EntriesEnd; CurEntry++) {
-      auto *NewEntry = Image->add_entries();
-
-      auto Entry = std::find(DeepCopiedEntryAddrs.begin(),
-                             DeepCopiedEntryAddrs.end(), CurEntry);
-      if (Entry != DeepCopiedEntryAddrs.end()) {
-        // Offload entry has already been loaded
-        shallowCopyOffloadEntry(CurEntry, NewEntry);
-      } else { // Offload Entry has not been loaded into the Request
-        copyOffloadEntry(CurEntry, NewEntry);
-        DeepCopiedEntryAddrs.push_back(CurEntry);
-      }
-
-      // Copy the pointer of the offload entry of the image into the Request
-      NewImagePtr->add_entry_ptrs((uint64_t)CurEntry);
-    }
-  }
-}
-
-void unloadTargetBinaryDescription(
-    const TargetBinaryDescription *Request, __tgt_bin_desc *Desc,
-    std::unordered_map<const void *, __tgt_device_image *>
-        &HostToRemoteDeviceImage) {
-  std::unordered_map<const void *, __tgt_offload_entry *> CopiedOffloadEntries;
-  Desc->NumDeviceImages = Request->images_size();
-  Desc->DeviceImages = new __tgt_device_image[Desc->NumDeviceImages];
-
-  if (Request->entries_size())
-    Desc->HostEntriesBegin = new __tgt_offload_entry[Request->entries_size()];
-  else {
-    Desc->HostEntriesBegin = nullptr;
-    Desc->HostEntriesEnd = nullptr;
-  }
-
-  // Copy Global Offload Entries
-  __tgt_offload_entry *CurEntry = Desc->HostEntriesBegin;
-  size_t I = 0;
-  for (auto &Entry : Request->entries()) {
-    copyOffloadEntry(Entry, CurEntry);
-    CopiedOffloadEntries[(void *)Request->entry_ptrs()[I]] = CurEntry;
-    CurEntry++;
-    I++;
-  }
-  Desc->HostEntriesEnd = CurEntry;
-
-  // Copy Device Images and Device Offload Entries
-  __tgt_device_image *CurImage = Desc->DeviceImages;
-  auto ImageItr = Request->image_ptrs().begin();
-  for (auto Image : Request->images()) {
-    // Copy Device Offload Entries
-    CurEntry = Desc->HostEntriesBegin;
-    bool Found = false;
-
-    if (!Desc->HostEntriesBegin) {
-      CurImage->EntriesBegin = nullptr;
-      CurImage->EntriesEnd = nullptr;
-    }
-
-    for (size_t I = 0; I < Image.entries_size(); I++) {
-      auto TgtEntry =
-          CopiedOffloadEntries.find((void *)Request->entry_ptrs()[I]);
-      if (TgtEntry != CopiedOffloadEntries.end()) {
-        if (!Found)
-          CurImage->EntriesBegin = CurEntry;
-
-        CurImage->EntriesEnd = CurEntry + 1;
-        Found = true;
-      } else {
-        Found = false;
-        copyOffloadEntry(Image.entries()[I], CurEntry);
-        CopiedOffloadEntries[(void *)(Request->entry_ptrs()[I])] = CurEntry;
-      }
-      CurEntry++;
-    }
-
-    // Copy Device Image
-    CurImage->ImageStart = new uint8_t[Image.binary().size()];
-    memcpy(CurImage->ImageStart,
-           static_cast<const void *>(Image.binary().data()),
-           Image.binary().size());
-    CurImage->ImageEnd =
-        (void *)((char *)CurImage->ImageStart + Image.binary().size());
-
-    HostToRemoteDeviceImage[(void *)ImageItr->img_ptr()] = CurImage;
-    CurImage++;
-    ImageItr++;
-  }
-}
-
-void freeTargetBinaryDescription(__tgt_bin_desc *Desc) {
-  __tgt_device_image *CurImage = Desc->DeviceImages;
-  for (auto I = 0; I < Desc->NumDeviceImages; I++, CurImage++)
-    delete[](uint64_t *) CurImage->ImageStart;
-
-  delete[] Desc->DeviceImages;
-
-  for (auto *Entry = Desc->HostEntriesBegin; Entry != Desc->HostEntriesEnd;
-       Entry++) {
-    free(Entry->name);
-    free(Entry->addr);
-  }
-
-  delete[] Desc->HostEntriesBegin;
-}
-
-void freeTargetTable(__tgt_target_table *Table) {
-  for (auto *Entry = Table->EntriesBegin; Entry != Table->EntriesEnd; Entry++)
-    free(Entry->name);
-
-  delete[] Table->EntriesBegin;
-}
-
-/// Serialises every entry of \p Table into \p TableResponse. For each entry
-/// the name, address, flags, reserved field and size are copied, and the
-/// address of the entry object itself is appended to entry_ptrs.
-///
-/// \p Image is kept for interface compatibility; it is not needed to build the
-/// response.
-void loadTargetTable(__tgt_target_table *Table, TargetTable &TableResponse,
-                     __tgt_device_image *Image) {
-  (void)Image;
-  for (__tgt_offload_entry *CurEntry = Table->EntriesBegin;
-       CurEntry != Table->EntriesEnd; CurEntry++) {
-    // TODO: This can probably be trimmed substantially.
-    auto *NewEntry = TableResponse.add_entries();
-    NewEntry->set_name(CurEntry->name);
-    NewEntry->set_addr((uint64_t)CurEntry->addr);
-    NewEntry->set_flags(CurEntry->flags);
-    NewEntry->set_reserved(CurEntry->reserved);
-    NewEntry->set_size(CurEntry->size);
-    TableResponse.add_entry_ptrs((int64_t)CurEntry);
-  }
-}
-
-/// Rebuilds \p Table from \p TableResponse. A fresh entry array is allocated,
-/// every received entry is copied into it, and the address of each copied
-/// entry is mapped to the matching value of entry_ptrs in
-/// \p HostToRemoteTargetTableMap.
-void unloadTargetTable(
-    TargetTable &TableResponse, __tgt_target_table *Table,
-    std::unordered_map<void *, void *> &HostToRemoteTargetTableMap) {
-  Table->EntriesBegin = new __tgt_offload_entry[TableResponse.entries_size()];
-
-  __tgt_offload_entry *Dest = Table->EntriesBegin;
-  size_t Index = 0;
-  while (Index < TableResponse.entries_size()) {
-    copyOffloadEntry(TableResponse.entries()[Index], Dest);
-    HostToRemoteTargetTableMap[Dest->addr] =
-        (void *)TableResponse.entry_ptrs()[Index];
-    ++Dest;
-    ++Index;
-  }
-
-  // One past the last entry that was filled in.
-  Table->EntriesEnd = Dest;
-}
-
-/// Fills \p Entry from a serialised host entry. The name and the data payload
-/// are copied into malloc-compatible buffers so they can be released with
-/// free().
-void copyOffloadEntry(const TargetOffloadEntry &EntryResponse,
-                      __tgt_offload_entry *Entry) {
-  const auto &Data = EntryResponse.data();
-
-  Entry->name = strdup(EntryResponse.name().c_str());
-  Entry->reserved = EntryResponse.reserved();
-  Entry->flags = EntryResponse.flags();
-
-  // The payload is raw bytes and may contain NUL characters. strdup() would
-  // stop copying at the first one while `size` still reports the full length,
-  // so copy exactly size() bytes instead. A trailing NUL is kept so the buffer
-  // can still be read as a C string.
-  char *Payload = static_cast<char *>(malloc(Data.size() + 1));
-  if (Payload) {
-    memcpy(Payload, Data.data(), Data.size());
-    Payload[Data.size()] = '\0';
-  }
-  Entry->addr = Payload;
-  Entry->size = Data.size();
-}
-
-/// Fills \p Entry from a serialised device entry. Only the name is duplicated;
-/// the address is taken over as the plain pointer value sent in the message.
-void copyOffloadEntry(const DeviceOffloadEntry &EntryResponse,
-                      __tgt_offload_entry *Entry) {
-  // Scalar fields first.
-  Entry->addr = (void *)EntryResponse.addr();
-  Entry->size = EntryResponse.size();
-  Entry->flags = EntryResponse.flags();
-  Entry->reserved = EntryResponse.reserved();
-
-  // The name gets its own heap copy.
-  Entry->name = strdup(EntryResponse.name().c_str());
-}
-
-/// Shallow copy: only the name of \p Entry is written to \p EntryResponse.
-/// The name is carried along as a convenient identifier; matching is actually
-/// done on the address.
-void shallowCopyOffloadEntry(const __tgt_offload_entry *Entry,
-                             TargetOffloadEntry *EntryResponse) {
-  EntryResponse->set_name(Entry->name);
-}
-
-/// Full copy of \p Entry into \p EntryResponse: name, reserved, flags, and the
-/// `size` bytes that `addr` points to as the data payload.
-void copyOffloadEntry(const __tgt_offload_entry *Entry,
-                      TargetOffloadEntry *EntryResponse) {
-  // Same as the shallow copy ...
-  EntryResponse->set_name(Entry->name);
-  // ... plus the remaining fields.
-  EntryResponse->set_reserved(Entry->reserved);
-  EntryResponse->set_flags(Entry->flags);
-  EntryResponse->set_data(Entry->addr, Entry->size);
-}
-
-/// Dumps the memory region from Start to End in order to debug memory transfer
-/// errors within the plugin.
-///
-/// The output is a hex dump on stdout with 16 bytes per row: the row offset,
-/// the bytes in hex, and the same bytes as text with non-printable characters
-/// shown as '.'. An empty or reversed range prints a single empty row.
-void dump(const void *Start, const void *End) {
-  // Zero-initialised so the final printf is well defined for an empty range.
-  unsigned char Line[17] = {0};
-  const unsigned char *PrintCharacter = (const unsigned char *)Start;
-  const unsigned char *Limit = (const unsigned char *)End;
-
-  // The length is measured in bytes; measuring it in `int` units would dump
-  // only a quarter of the region.
-  const size_t Length =
-      Limit > PrintCharacter ? (size_t)(Limit - PrintCharacter) : 0;
-
-  size_t I = 0;
-  for (; I < Length; I++) {
-    if ((I % 16) == 0) {
-      // Finish the previous row with its text column before starting a new
-      // one.
-      if (I != 0)
-        printf("  %s\n", Line);
-
-      printf("  %04zx ", I);
-    }
-
-    printf(" %02x", PrintCharacter[I]);
-
-    if ((PrintCharacter[I] < 0x20) || (PrintCharacter[I] > 0x7e))
-      Line[I % 16] = '.';
-    else
-      Line[I % 16] = PrintCharacter[I];
-
-    Line[(I % 16) + 1] = '\0';
-  }
-
-  // Pad a partial last row so the text column stays aligned.
-  while ((I % 16) != 0) {
-    printf("   ");
-    I++;
-  }
-
-  printf("  %s\n", Line);
-}
-
-/// Prints the fields of one offload entry to stderr, together with the
-/// addresses of the name, reserved and flags members.
-void dump(__tgt_offload_entry *Entry) {
-  auto &E = *Entry;
-
-  fprintf(stderr, "Entry (%p):\n", (void *)Entry);
-  fprintf(stderr, "  Name: %s (%p)\n", E.name, (void *)&E.name);
-  fprintf(stderr, "  Reserved: %d (%p)\n", E.reserved, (void *)&E.reserved);
-  fprintf(stderr, "  Flags: %d (%p)\n", E.flags, (void *)&E.flags);
-  fprintf(stderr, "  Addr: %p\n", E.addr);
-  fprintf(stderr, "  Size: %lu\n", E.size);
-}
-
-/// Dumps every entry of \p Table, in order.
-void dump(__tgt_target_table *Table) {
-  __tgt_offload_entry *Cursor = Table->EntriesBegin;
-  while (Cursor != Table->EntriesEnd) {
-    dump(Cursor);
-    ++Cursor;
-  }
-}
-
-/// Prints a serialised offload entry to stderr and hex-dumps its data payload.
-void dump(TargetOffloadEntry Entry) {
-  const auto &Payload = Entry.data();
-
-  fprintf(stderr, "Entry: ");
-  fprintf(stderr, "  Name: %s\n", Entry.name().c_str());
-  fprintf(stderr, "  Reserved: %d\n", Entry.reserved());
-  fprintf(stderr, "  Flags: %d\n", Entry.flags());
-  // size() returns a size_t, which needs the z length modifier.
-  fprintf(stderr, "  Size:  %zu\n", Payload.size());
-  dump(static_cast<const void *>(Payload.data()),
-       static_cast<const void *>(Payload.data() + Payload.size()));
-}
-
-/// Hex-dumps the binary of \p Image, then dumps each of its offload entries.
-void dump(__tgt_device_image *Image) {
-  dump(Image->ImageStart, Image->ImageEnd);
-
-  for (__tgt_offload_entry *Cursor = Image->EntriesBegin;
-       Cursor != Image->EntriesEnd; ++Cursor)
-    dump(Cursor);
-}
-
-/// Prints every key/value pair of \p Map to stderr: the key pointer, the
-/// mapped entry pointer and that entry's addr field.
-void dump(std::unordered_map<void *, __tgt_offload_entry *> &Map) {
-  fprintf(stderr, "Host to Remote Entry Map:\n");
-  for (const auto &Pair : Map) {
-    __tgt_offload_entry *Target = Pair.second;
-    fprintf(stderr, "  Host (%p) -> Tgt (%p): Addr((%p))\n", Pair.first,
-            (void *)Target, (void *)Target->addr);
-  }
-}
-} // namespace RemoteOffloading
diff --git a/openmp/libomptarget/plugins/remote/server/CMakeLists.txt b/openmp/libomptarget/plugins/remote/server/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/server/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build server for remote offloading.
-#
-##===----------------------------------------------------------------------===##
-
-# The standalone server executable: the libomptarget sources, the generated
-# gRPC sources, the shared remote utilities and the server implementation.
-add_executable(openmp-offloading-server
-        ${LIBOMPTARGET_SRC_FILES}
-        ${GRPC_SRC_FILES}
-        ${RPC_SRC_DIR}/Utils.cpp
-        Server.cpp
-        OffloadingServer.cpp
-)
-
-# Header search paths are attached to the target instead of the directory so
-# they cannot leak into unrelated targets.
-target_include_directories(openmp-offloading-server PRIVATE
-        ${LIBOMPTARGET_SRC_DIR}
-        ${LIBOMPTARGET_INCLUDE_DIR}
-        ${GRPC_INCLUDE_DIR}
-        ${RPC_INCLUDE_DIR}
-)
-
-# Nothing links against an executable, so every dependency is PRIVATE.
-target_link_libraries(openmp-offloading-server PRIVATE
-        grpc++
-        protobuf
-        absl::synchronization
-        ${OPENMP_PTHREAD_LIB}
-        omp
-        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../../exports")
diff --git a/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp b/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/server/OffloadingServer.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===------------- OffloadingServer.cpp - Server Application --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Offloading server for remote host.
-//
-//===----------------------------------------------------------------------===//
-
-#include <future>
-#include <grpcpp/server.h>
-#include <grpcpp/server_builder.h>
-#include <iostream>
-#include <thread>
-
-#include "Server.h"
-
-using grpc::Server;
-using grpc::ServerBuilder;
-
-std::promise<void> ShutdownPromise;
-
-/// Entry point of the offloading server.
-///
-/// Starts a gRPC server on the first configured address, then blocks until
-/// ShutdownPromise is fulfilled and shuts the server down.
-///
-/// \returns 0 after a clean shutdown, 1 if the server could not be started.
-int main() {
-  ClientManagerConfigTy Config;
-  const auto &Address = Config.ServerAddresses[0];
-
-  RemoteOffloadImpl Service(Config.MaxSize, Config.BlockSize);
-
-  ServerBuilder Builder;
-  Builder.AddListeningPort(Address, grpc::InsecureServerCredentials());
-  Builder.RegisterService(&Service);
-  Builder.SetMaxMessageSize(INT_MAX);
-  std::unique_ptr<Server> Server(Builder.BuildAndStart());
-
-  // BuildAndStart() returns null on failure (for example when the port cannot
-  // be bound); calling Wait() on it would dereference a null pointer.
-  if (!Server) {
-    std::cerr << "Failed to start server on " << Address << std::endl;
-    return 1;
-  }
-
-  if (getDebugLevel())
-    std::cerr << "Server listening on " << Address << std::endl;
-
-  // Wait() blocks, so it runs on its own thread while this thread waits for
-  // the shutdown signal.
-  auto WaitForServer = [&]() { Server->Wait(); };
-
-  std::thread ServerThread(WaitForServer);
-
-  auto ShutdownFuture = ShutdownPromise.get_future();
-  ShutdownFuture.wait();
-  Server->Shutdown();
-  ServerThread.join();
-
-  return 0;
-}
diff --git a/openmp/libomptarget/plugins/remote/server/Server.h b/openmp/libomptarget/plugins/remote/server/Server.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/server/Server.h
+++ /dev/null
@@ -1,106 +0,0 @@
-//===-------------------------- Server.h - Server -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Offloading gRPC server for remote host.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SERVER_SERVER_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SERVER_SERVER_H
-
-#include <grpcpp/server_context.h>
-
-#include "Utils.h"
-#include "device.h"
-#include "omptarget.h"
-#include "openmp.grpc.pb.h"
-#include "openmp.pb.h"
-#include "rtl.h"
-
-using grpc::ServerContext;
-using grpc::ServerReader;
-using grpc::ServerWriter;
-using grpc::Status;
-
-using namespace openmp::libomptarget::remote;
-using namespace RemoteOffloading;
-
-using namespace google;
-
-extern PluginManager *PM;
-
-/// gRPC service implementation of the remote offloading server. Each RPC
-/// forwards to the plugin manager (PM) of this process.
-class RemoteOffloadImpl final : public RemoteOffload::Service {
-private:
-  /// Converts a device id as seen by the client into the id local to the RTL
-  /// that owns the device.
-  int32_t mapHostRTLDeviceId(int32_t RTLDeviceID);
-
-  /// Image pointer sent by the client -> image rebuilt on this server.
-  std::unordered_map<const void *, __tgt_device_image *>
-      HostToRemoteDeviceImage;
-  /// Descriptor pointer sent by the client -> descriptor rebuilt on this
-  /// server.
-  std::unordered_map<const void *, std::unique_ptr<__tgt_bin_desc>>
-      Descriptions;
-  /// Table returned by the most recent LoadBinary call.
-  __tgt_target_table *Table = nullptr;
-
-  int DebugLevel;
-  /// Arena size threshold above which the arena is reset.
-  uint64_t MaxSize;
-  /// Chunk size used when streaming data back to the client.
-  uint64_t BlockSize;
-  std::unique_ptr<protobuf::Arena> Arena;
-
-public:
-  RemoteOffloadImpl(uint64_t MaxSize, uint64_t BlockSize)
-      : DebugLevel(getDebugLevel()), MaxSize(MaxSize), BlockSize(BlockSize),
-        Arena(std::make_unique<protobuf::Arena>()) {}
-
-  // Server lifetime.
-  Status Shutdown(ServerContext *Context, const Null *Request,
-                  I32 *Reply) override;
-
-  // Library registration.
-  Status RegisterLib(ServerContext *Context,
-                     const TargetBinaryDescription *Description,
-                     I32 *Reply) override;
-  Status UnregisterLib(ServerContext *Context, const Pointer *Request,
-                       I32 *Reply) override;
-
-  // Queries.
-  Status IsValidBinary(ServerContext *Context,
-                       const TargetDeviceImagePtr *Image,
-                       I32 *IsValid) override;
-  Status GetNumberOfDevices(ServerContext *Context, const Null *Null,
-                            I32 *NumberOfDevices) override;
-
-  // Device initialisation.
-  Status InitDevice(ServerContext *Context, const I32 *DeviceNum,
-                    I32 *Reply) override;
-  Status InitRequires(ServerContext *Context, const I64 *RequiresFlag,
-                      I32 *Reply) override;
-
-  Status LoadBinary(ServerContext *Context, const Binary *Binary,
-                    TargetTable *Reply) override;
-  Status IsDataExchangeable(ServerContext *Context, const DevicePair *Request,
-                            I32 *Reply) override;
-
-  // Data management.
-  Status DataAlloc(ServerContext *Context, const AllocData *Request,
-                   Pointer *Reply) override;
-
-  Status DataSubmit(ServerContext *Context, ServerReader<SubmitData> *Reader,
-                    I32 *Reply) override;
-  Status DataRetrieve(ServerContext *Context, const RetrieveData *Request,
-                      ServerWriter<Data> *Writer) override;
-
-  Status DataExchange(ServerContext *Context, const ExchangeData *Request,
-                      I32 *Reply) override;
-
-  Status DataDelete(ServerContext *Context, const DeleteData *Request,
-                    I32 *Reply) override;
-
-  // Kernel launch.
-  Status RunTargetRegion(ServerContext *Context, const TargetRegion *Request,
-                         I32 *Reply) override;
-
-  Status RunTargetTeamRegion(ServerContext *Context,
-                             const TargetTeamRegion *Request,
-                             I32 *Reply) override;
-};
-
-#endif
diff --git a/openmp/libomptarget/plugins/remote/server/Server.cpp b/openmp/libomptarget/plugins/remote/server/Server.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/server/Server.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-//===----------------- Server.cpp - Server Implementation -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Offloading gRPC server for remote host.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cmath>
-#include <future>
-
-#include "Server.h"
-#include "omptarget.h"
-#include "openmp.grpc.pb.h"
-#include "openmp.pb.h"
-
-using grpc::WriteOptions;
-
-extern std::promise<void> ShutdownPromise;
-
-/// Fulfils ShutdownPromise, which is defined in another translation unit, and
-/// replies 0.
-Status RemoteOffloadImpl::Shutdown(ServerContext *Context, const Null *Request,
-                                   I32 *Reply) {
-  SERVER_DBG("Shutting down the server")
-
-  ShutdownPromise.set_value();
-  Reply->set_number(0);
-
-  return Status::OK;
-}
-
-/// Rebuilds the binary description sent by the client, registers it with the
-/// plugin manager and stores it under the client's descriptor pointer.
-///
-/// If a description is already stored under that pointer, its buffers are
-/// freed and it is replaced by the new one, so the map never keeps a freed
-/// description and the newly registered one stays alive.
-Status
-RemoteOffloadImpl::RegisterLib(ServerContext *Context,
-                               const TargetBinaryDescription *Description,
-                               I32 *Reply) {
-  auto Desc = std::make_unique<__tgt_bin_desc>();
-
-  unloadTargetBinaryDescription(Description, Desc.get(),
-                                HostToRemoteDeviceImage);
-  PM->RTLs.RegisterLib(Desc.get());
-
-  void *Key = (void *)Description->bin_ptr();
-  auto Existing = Descriptions.find(Key);
-  if (Existing != Descriptions.end()) {
-    // NOTE(review): the previous description is freed without a matching
-    // PM->RTLs.UnregisterLib() call, as before - confirm whether it should be
-    // unregistered first.
-    freeTargetBinaryDescription(Existing->second.get());
-    Existing->second = std::move(Desc);
-  } else {
-    Descriptions.emplace(Key, std::move(Desc));
-  }
-
-  SERVER_DBG("Registered library")
-  Reply->set_number(0);
-  return Status::OK;
-}
-
-/// Unregisters and frees the description stored under the pointer in
-/// \p Request. Replies 1 if no such description exists, 0 otherwise.
-Status RemoteOffloadImpl::UnregisterLib(ServerContext *Context,
-                                        const Pointer *Request, I32 *Reply) {
-  const auto Found = Descriptions.find((void *)Request->number());
-  if (Found == Descriptions.end()) {
-    Reply->set_number(1);
-    return Status::OK;
-  }
-
-  __tgt_bin_desc *Desc = Found->second.get();
-  PM->RTLs.UnregisterLib(Desc);
-  freeTargetBinaryDescription(Desc);
-  Descriptions.erase(Found);
-
-  SERVER_DBG("Unregistered library")
-  Reply->set_number(0);
-  return Status::OK;
-}
-
-/// Asks every RTL whether it accepts the image that corresponds to the
-/// client's image pointer. The reply is the first non-zero answer, or 0 if no
-/// RTL accepts the image or the image is unknown to this server.
-Status RemoteOffloadImpl::IsValidBinary(ServerContext *Context,
-                                        const TargetDeviceImagePtr *DeviceImage,
-                                        I32 *IsValid) {
-  IsValid->set_number(0);
-
-  // find() instead of operator[]: an unknown pointer must neither insert a
-  // null entry into the map nor be handed to the plugins as a null image.
-  const auto Found =
-      HostToRemoteDeviceImage.find((void *)DeviceImage->image_ptr());
-  if (Found != HostToRemoteDeviceImage.end() && Found->second) {
-    __tgt_device_image *Image = Found->second;
-    for (auto &RTL : PM->RTLs.AllRTLs)
-      if (auto Ret = RTL.is_valid_binary(Image)) {
-        IsValid->set_number(Ret);
-        break;
-      }
-  }
-
-  SERVER_DBG("Checked if binary (%p) is valid",
-             (void *)(DeviceImage->image_ptr()))
-  return Status::OK;
-}
-
-/// Replies with the sum of NumberOfDevices over all RTLs. The sum is computed
-/// while holding PM->RTLsMtx.
-Status RemoteOffloadImpl::GetNumberOfDevices(ServerContext *Context,
-                                             const Null *Null,
-                                             I32 *NumberOfDevices) {
-  int32_t Total = 0;
-
-  PM->RTLsMtx.lock();
-  for (auto &RTL : PM->RTLs.AllRTLs)
-    Total += RTL.NumberOfDevices;
-  PM->RTLsMtx.unlock();
-
-  NumberOfDevices->set_number(Total);
-  SERVER_DBG("Got number of devices")
-
-  return Status::OK;
-}
-
-/// Initialises the requested device through its RTL and replies with the
-/// RTL's return value.
-Status RemoteOffloadImpl::InitDevice(ServerContext *Context,
-                                     const I32 *DeviceNum, I32 *Reply) {
-  auto &Device = PM->Devices[DeviceNum->number()];
-  const auto Ret =
-      Device->RTL->init_device(mapHostRTLDeviceId(DeviceNum->number()));
-  Reply->set_number(Ret);
-
-  SERVER_DBG("Initialized device %d", DeviceNum->number())
-  return Status::OK;
-}
-
-/// Passes the requires flags to every device whose RTL provides
-/// init_requires, then echoes the flags back in the reply.
-Status RemoteOffloadImpl::InitRequires(ServerContext *Context,
-                                       const I64 *RequiresFlag, I32 *Reply) {
-  for (auto &Device : PM->Devices) {
-    // init_requires is optional for a plugin.
-    if (!Device->RTL->init_requires)
-      continue;
-    Device->RTL->init_requires(RequiresFlag->number());
-  }
-  Reply->set_number(RequiresFlag->number());
-
-  SERVER_DBG("Initialized requires for devices")
-  return Status::OK;
-}
-
-/// Loads the image that corresponds to the client's image pointer onto the
-/// requested device and, on success, serialises the resulting target table
-/// into \p Reply. The reply is left empty when the image is unknown to this
-/// server or the plugin returns no table.
-Status RemoteOffloadImpl::LoadBinary(ServerContext *Context,
-                                     const Binary *Binary, TargetTable *Reply) {
-  // find() instead of operator[]: an unknown pointer must neither insert a
-  // null entry into the map nor be handed to the plugin as a null image.
-  const auto Found = HostToRemoteDeviceImage.find((void *)Binary->image_ptr());
-  if (Found == HostToRemoteDeviceImage.end() || !Found->second) {
-    SERVER_DBG("Binary (%p) is not registered", (void *)Binary->image_ptr())
-    return Status::OK;
-  }
-  __tgt_device_image *Image = Found->second;
-
-  Table = PM->Devices[Binary->device_id()]->RTL->load_binary(
-      mapHostRTLDeviceId(Binary->device_id()), Image);
-  if (Table)
-    loadTargetTable(Table, *Reply, Image);
-
-  SERVER_DBG("Loaded binary (%p) to device %d", (void *)Binary->image_ptr(),
-             Binary->device_id())
-  return Status::OK;
-}
-
-/// Replies with the source RTL's answer to is_data_exchangable for the two
-/// devices, or -1 when the RTL does not provide that entry point.
-Status RemoteOffloadImpl::IsDataExchangeable(ServerContext *Context,
-                                             const DevicePair *Request,
-                                             I32 *Reply) {
-  Reply->set_number(-1);
-
-  // As in DataExchange, PM->Devices is indexed with the id sent by the client
-  // while the RTL entry point receives the ids mapped with
-  // mapHostRTLDeviceId().
-  auto &SrcDevice = PM->Devices[Request->src_dev_id()];
-  if (SrcDevice->RTL->is_data_exchangable)
-    Reply->set_number(SrcDevice->RTL->is_data_exchangable(
-        mapHostRTLDeviceId(Request->src_dev_id()),
-        mapHostRTLDeviceId(Request->dst_dev_id())));
-
-  SERVER_DBG("Checked if data exchangeable between device %d and device %d",
-             Request->src_dev_id(), Request->dst_dev_id())
-  return Status::OK;
-}
-
-/// Allocates memory on the requested device through its RTL and replies with
-/// the returned pointer as an integer.
-Status RemoteOffloadImpl::DataAlloc(ServerContext *Context,
-                                    const AllocData *Request, Pointer *Reply) {
-  auto &Device = PM->Devices[Request->device_id()];
-  auto Allocation = Device->RTL->data_alloc(
-      mapHostRTLDeviceId(Request->device_id()), Request->size(),
-      (void *)Request->hst_ptr(), TARGET_ALLOC_DEFAULT);
-
-  const uint64_t TgtPtr = (uint64_t)Allocation;
-  Reply->set_number(TgtPtr);
-
-  SERVER_DBG("Allocated at " DPxMOD "", DPxPTR((void *)TgtPtr))
-
-  return Status::OK;
-}
-
-/// Receives data from the client and submits it to the device.
-///
-/// A message that starts at offset 0 and carries the whole transfer is
-/// submitted directly. Otherwise the chunks are assembled in a host staging
-/// buffer, sized by the first chunk's announced total, and submitted once the
-/// stream ends.
-///
-/// \returns INVALID_ARGUMENT if a chunk does not fit into the announced
-/// transfer size, Status::OK otherwise.
-Status RemoteOffloadImpl::DataSubmit(ServerContext *Context,
-                                     ServerReader<SubmitData> *Reader,
-                                     I32 *Reply) {
-  SubmitData Request;
-  // Owned by a unique_ptr so that every return path releases the buffer.
-  std::unique_ptr<uint8_t[]> HostCopy;
-  uint64_t TotalSize = 0;
-  while (Reader->Read(&Request)) {
-    if (Request.start() == 0 && Request.size() == Request.data().size()) {
-      Reader->SendInitialMetadata();
-
-      Reply->set_number(PM->Devices[Request.device_id()]->RTL->data_submit(
-          mapHostRTLDeviceId(Request.device_id()), (void *)Request.tgt_ptr(),
-          (void *)Request.data().data(), Request.data().size()));
-
-      SERVER_DBG("Submitted %lu bytes async to (%p) on device %d",
-                 Request.data().size(), (void *)Request.tgt_ptr(),
-                 Request.device_id())
-
-      return Status::OK;
-    }
-    if (!HostCopy) {
-      TotalSize = Request.size();
-      HostCopy = std::make_unique<uint8_t[]>(TotalSize);
-      Reader->SendInitialMetadata();
-    }
-
-    // Offset and length come from the network: reject any chunk that would
-    // write outside the staging buffer.
-    const uint64_t ChunkStart = Request.start();
-    const uint64_t ChunkSize = Request.data().size();
-    if (ChunkStart > TotalSize || ChunkSize > TotalSize - ChunkStart)
-      return Status(grpc::StatusCode::INVALID_ARGUMENT,
-                    "Data chunk exceeds the announced transfer size");
-
-    memcpy(HostCopy.get() + ChunkStart, Request.data().data(), ChunkSize);
-  }
-
-  // The submitted length is the size the staging buffer was allocated with.
-  Reply->set_number(PM->Devices[Request.device_id()]->RTL->data_submit(
-      mapHostRTLDeviceId(Request.device_id()), (void *)Request.tgt_ptr(),
-      HostCopy.get(), TotalSize));
-
-  SERVER_DBG("Submitted %lu bytes to (%p) on device %d", Request.data().size(),
-             (void *)Request.tgt_ptr(), Request.device_id())
-
-  return Status::OK;
-}
-
-/// Reads data back from the device and streams it to the client.
-///
-/// Transfers larger than BlockSize are sent as consecutive chunks of at most
-/// BlockSize bytes; smaller transfers are sent as a single final message.
-/// Every message carries the plugin's return code.
-Status RemoteOffloadImpl::DataRetrieve(ServerContext *Context,
-                                       const RetrieveData *Request,
-                                       ServerWriter<Data> *Writer) {
-  auto HstPtr = std::make_unique<char[]>(Request->size());
-
-  auto Ret = PM->Devices[Request->device_id()]->RTL->data_retrieve(
-      mapHostRTLDeviceId(Request->device_id()), HstPtr.get(),
-      (void *)Request->tgt_ptr(), Request->size());
-
-  if (Arena->SpaceAllocated() >= MaxSize)
-    Arena->Reset();
-
-  if (Request->size() > BlockSize) {
-    // Chunk boundaries are computed with integers; a float division loses
-    // precision for large transfers and can miscount the chunks.
-    const uint64_t Total = Request->size();
-    for (uint64_t Start = 0; Start < Total; Start += BlockSize) {
-      const uint64_t End =
-          (Total - Start > BlockSize) ? Start + BlockSize : Total;
-
-      auto *Reply = protobuf::Arena::CreateMessage<Data>(Arena.get());
-
-      Reply->set_start(Start);
-      Reply->set_size(Request->size());
-      Reply->set_data((char *)HstPtr.get() + Start, End - Start);
-      Reply->set_ret(Ret);
-
-      if (!Writer->Write(*Reply)) {
-        SERVER_DBG("Broken stream when submitting data")
-      }
-
-      SERVER_DBG("Retrieved %lu-%lu/%lu bytes from (%p) on device %d", Start,
-                 End, Request->size(), (void *)Request->tgt_ptr(),
-                 mapHostRTLDeviceId(Request->device_id()))
-    }
-  } else {
-    auto *Reply = protobuf::Arena::CreateMessage<Data>(Arena.get());
-
-    Reply->set_start(0);
-    Reply->set_size(Request->size());
-    Reply->set_data((char *)HstPtr.get(), Request->size());
-    Reply->set_ret(Ret);
-
-    SERVER_DBG("Retrieved %lu bytes from (%p) on device %d", Request->size(),
-               (void *)Request->tgt_ptr(),
-               mapHostRTLDeviceId(Request->device_id()))
-
-    Writer->WriteLast(*Reply, WriteOptions());
-  }
-
-  return Status::OK;
-}
-
-/// Copies data between two devices through the source device's RTL. Replies
-/// with the RTL's return value, or -1 when the RTL has no data_exchange entry
-/// point.
-Status RemoteOffloadImpl::DataExchange(ServerContext *Context,
-                                       const ExchangeData *Request,
-                                       I32 *Reply) {
-  auto &SrcDevice = PM->Devices[Request->src_dev_id()];
-
-  int32_t Ret = -1;
-  if (SrcDevice->RTL->data_exchange)
-    Ret = SrcDevice->RTL->data_exchange(
-        mapHostRTLDeviceId(Request->src_dev_id()), (void *)Request->src_ptr(),
-        mapHostRTLDeviceId(Request->dst_dev_id()), (void *)Request->dst_ptr(),
-        Request->size());
-  Reply->set_number(Ret);
-
-  SERVER_DBG(
-      "Exchanged data asynchronously from device %d (%p) to device %d (%p) of "
-      "size %lu",
-      mapHostRTLDeviceId(Request->src_dev_id()), (void *)Request->src_ptr(),
-      mapHostRTLDeviceId(Request->dst_dev_id()), (void *)Request->dst_ptr(),
-      Request->size())
-  return Status::OK;
-}
-
-/// Frees device memory through the device's RTL and replies with the RTL's
-/// return value.
-Status RemoteOffloadImpl::DataDelete(ServerContext *Context,
-                                     const DeleteData *Request, I32 *Reply) {
-  auto &Device = PM->Devices[Request->device_id()];
-  Reply->set_number(Device->RTL->data_delete(
-      mapHostRTLDeviceId(Request->device_id()), (void *)Request->tgt_ptr()));
-
-  SERVER_DBG("Deleted data from (%p) on device %d", (void *)Request->tgt_ptr(),
-             mapHostRTLDeviceId(Request->device_id()))
-  return Status::OK;
-}
-
-/// Launches a target region on the requested device and replies with the
-/// RTL's return value.
-///
-/// The argument pointers and offsets are unpacked from the request into
-/// contiguous arrays, and the kernel address is read from the offload entry
-/// that the request's entry pointer refers to.
-Status RemoteOffloadImpl::RunTargetRegion(ServerContext *Context,
-                                          const TargetRegion *Request,
-                                          I32 *Reply) {
-  // Each slot must be pointer-sized because the array is handed to the plugin
-  // as `void **`; byte-sized slots would truncate every argument and make the
-  // plugin read past the end of the buffer.
-  std::vector<uint64_t> TgtArgs(Request->arg_num());
-  for (auto I = 0; I < Request->arg_num(); I++)
-    TgtArgs[I] = (uint64_t)Request->tgt_args()[I];
-
-  std::vector<ptrdiff_t> TgtOffsets(Request->arg_num());
-  const auto *TgtOffsetItr = Request->tgt_offsets().begin();
-  for (auto I = 0; I < Request->arg_num(); I++, TgtOffsetItr++)
-    TgtOffsets[I] = (ptrdiff_t)*TgtOffsetItr;
-
-  void *TgtEntryPtr = ((__tgt_offload_entry *)Request->tgt_entry_ptr())->addr;
-
-  int32_t Ret = PM->Devices[Request->device_id()]->RTL->run_region(
-      mapHostRTLDeviceId(Request->device_id()), TgtEntryPtr,
-      (void **)TgtArgs.data(), TgtOffsets.data(), Request->arg_num());
-
-  Reply->set_number(Ret);
-
-  SERVER_DBG("Ran TargetRegion on device %d with %d args",
-             mapHostRTLDeviceId(Request->device_id()), Request->arg_num())
-  return Status::OK;
-}
-
-/// Launches a target team region on the requested device with the team count,
-/// thread limit and loop trip count from the request, and replies with the
-/// RTL's return value.
-Status RemoteOffloadImpl::RunTargetTeamRegion(ServerContext *Context,
-                                              const TargetTeamRegion *Request,
-                                              I32 *Reply) {
-  std::vector<uint64_t> TgtArgs(Request->arg_num());
-  std::vector<ptrdiff_t> TgtOffsets(Request->arg_num());
-
-  // Unpack argument pointers and offsets in a single pass.
-  const auto *OffsetIt = Request->tgt_offsets().begin();
-  for (auto Idx = 0; Idx < Request->arg_num(); ++Idx, ++OffsetIt) {
-    TgtArgs[Idx] = (uint64_t)Request->tgt_args()[Idx];
-    TgtOffsets[Idx] = (ptrdiff_t)*OffsetIt;
-  }
-
-  // The request carries a pointer to the offload entry; the kernel address is
-  // that entry's addr field.
-  auto *OffloadEntry = (__tgt_offload_entry *)Request->tgt_entry_ptr();
-  void *TgtEntryPtr = OffloadEntry->addr;
-
-  auto &Device = PM->Devices[Request->device_id()];
-  int32_t Ret = Device->RTL->run_team_region(
-      mapHostRTLDeviceId(Request->device_id()), TgtEntryPtr,
-      (void **)TgtArgs.data(), TgtOffsets.data(), Request->arg_num(),
-      Request->team_num(), Request->thread_limit(), Request->loop_tripcount());
-
-  Reply->set_number(Ret);
-
-  SERVER_DBG("Ran TargetTeamRegion on device %d with %d args",
-             mapHostRTLDeviceId(Request->device_id()), Request->arg_num())
-  return Status::OK;
-}
-
-/// Walks the used RTLs in order and subtracts each RTL's device count from
-/// \p RTLDeviceID until the remainder is smaller than the current RTL's count.
-/// \returns the remaining id.
-int32_t RemoteOffloadImpl::mapHostRTLDeviceId(int32_t RTLDeviceID) {
-  for (auto &RTL : PM->RTLs.UsedRTLs) {
-    // The id falls inside this RTL's range: stop here.
-    if (RTLDeviceID - RTL->NumberOfDevices < 0)
-      break;
-    RTLDeviceID -= RTL->NumberOfDevices;
-  }
-  return RTLDeviceID;
-}
diff --git a/openmp/libomptarget/plugins/remote/src/CMakeLists.txt b/openmp/libomptarget/plugins/remote/src/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/src/CMakeLists.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for remote offloading.
-#
-##===----------------------------------------------------------------------===##
-
-cmake_minimum_required(VERSION 3.13.4)
-
-# The remote offloading plugin: the libomptarget sources, the generated gRPC
-# sources, the shared remote utilities and the client implementation.
-add_library(omptarget.rtl.rpc SHARED
-        ${LIBOMPTARGET_SRC_FILES}
-        ${GRPC_SRC_FILES}
-        ${RPC_SRC_DIR}/Utils.cpp
-        Client.cpp
-        rtl.cpp
-)
-
-# Define the suffix for the runtime messaging dumps. Scoped to the plugin
-# target so it cannot leak into unrelated targets.
-target_compile_definitions(omptarget.rtl.rpc PRIVATE TARGET_NAME=RPC)
-
-# Header search paths, likewise scoped to the plugin target.
-target_include_directories(omptarget.rtl.rpc PRIVATE
-        ${LIBOMPTARGET_SRC_DIR}
-        ${LIBOMPTARGET_INCLUDE_DIR}
-        ${GRPC_INCLUDE_DIR}
-        ${RPC_INCLUDE_DIR}
-)
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.rpc LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-
-# No target in this file links against the plugin, so its dependencies are
-# PRIVATE.
-target_link_libraries(omptarget.rtl.rpc PRIVATE
-  grpc++
-  protobuf
-  absl::synchronization
-  ${OPENMP_PTHREAD_LIB}
-  omp
-  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../../exports")
-
-# Report to the parent scope that we are building a plugin for RPC.
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} rpc" PARENT_SCOPE)
diff --git a/openmp/libomptarget/plugins/remote/src/Client.h b/openmp/libomptarget/plugins/remote/src/Client.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/src/Client.h
+++ /dev/null
@@ -1,153 +0,0 @@
-//===------------------ Client.h - Client Implementation ------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// gRPC Client for the remote plugin.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SRC_CLIENT_H
-#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_REMOTE_SRC_CLIENT_H
-
-#include "Utils.h"
-#include "omptarget.h"
-#include <google/protobuf/arena.h>
-#include <grpcpp/grpcpp.h>
-#include <grpcpp/security/credentials.h>
-#include <grpcpp/support/channel_arguments.h>
-#include <memory>
-#include <mutex>
-#include <numeric>
-
-using grpc::Channel;
-using openmp::libomptarget::remote::RemoteOffload;
-using namespace RemoteOffloading;
-
-using namespace google;
-
-/// gRPC client for one offloading server. Every public member issues the RPC
-/// of the same name through the stub.
-class RemoteOffloadClient {
-  int DebugLevel;
-  /// Deadline applied to calls that may time out, in seconds.
-  const int Timeout;
-  /// Arena size threshold above which the arena is reset.
-  const uint64_t MaxSize;
-  const int64_t BlockSize;
-
-  std::unique_ptr<RemoteOffload::Stub> Stub;
-  std::unique_ptr<protobuf::Arena> Arena;
-
-  /// Guards the arena size check and reset in remoteCall().
-  std::unique_ptr<std::mutex> ArenaAllocatorLock;
-
-  std::map<int32_t, std::unordered_map<void *, void *>> RemoteEntries;
-  /// Tables owned by this client; they are freed in the destructor.
-  std::map<int32_t, std::unique_ptr<__tgt_target_table>> DevicesToTables;
-
-  /// Runs \p Preprocessor to issue the RPC and, if the call succeeded, returns
-  /// the result of \p Postprocessor on the reply; otherwise returns
-  /// \p ErrorValue.
-  template <typename Fn1, typename Fn2, typename TReturn>
-  auto remoteCall(Fn1 Preprocessor, Fn2 Postprocessor, TReturn ErrorValue,
-                  bool CanTimeOut = true);
-
-public:
-  RemoteOffloadClient(std::shared_ptr<Channel> Channel, int Timeout,
-                      uint64_t MaxSize, int64_t BlockSize)
-      : DebugLevel(getDebugLevel()), Timeout(Timeout), MaxSize(MaxSize),
-        BlockSize(BlockSize), Stub(RemoteOffload::NewStub(Channel)),
-        Arena(std::make_unique<protobuf::Arena>()),
-        ArenaAllocatorLock(std::make_unique<std::mutex>()) {}
-
-  RemoteOffloadClient(RemoteOffloadClient &&C) = default;
-
-  ~RemoteOffloadClient() {
-    for (auto &DeviceAndTable : DevicesToTables)
-      freeTargetTable(DeviceAndTable.second.get());
-  }
-
-  int32_t shutdown(void);
-
-  int32_t registerLib(__tgt_bin_desc *Desc);
-  int32_t unregisterLib(__tgt_bin_desc *Desc);
-
-  int32_t isValidBinary(__tgt_device_image *Image);
-  int32_t getNumberOfDevices();
-
-  int32_t initDevice(int32_t DeviceId);
-  int32_t initRequires(int64_t RequiresFlags);
-
-  __tgt_target_table *loadBinary(int32_t DeviceId, __tgt_device_image *Image);
-
-  void *dataAlloc(int32_t DeviceId, int64_t Size, void *HstPtr);
-  int32_t dataDelete(int32_t DeviceId, void *TgtPtr);
-
-  int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                     int64_t Size);
-  int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                       int64_t Size);
-
-  int32_t isDataExchangeable(int32_t SrcDevId, int32_t DstDevId);
-  int32_t dataExchange(int32_t SrcDevId, void *SrcPtr, int32_t DstDevId,
-                       void *DstPtr, int64_t Size);
-
-  int32_t runTargetRegion(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
-                          ptrdiff_t *TgtOffsets, int32_t ArgNum);
-  int32_t runTargetTeamRegion(int32_t DeviceId, void *TgtEntryPtr,
-                              void **TgtArgs, ptrdiff_t *TgtOffsets,
-                              int32_t ArgNum, int32_t TeamNum,
-                              int32_t ThreadLimit, uint64_t LoopTripCount);
-};
-
-/// Owns one RemoteOffloadClient per configured server address. The public
-/// interface mirrors the plugin operations of RemoteOffloadClient.
-class RemoteClientManager {
-private:
-  std::vector<RemoteOffloadClient> Clients;
-  std::vector<int> Devices;
-
-  std::pair<int32_t, int32_t> mapDeviceId(int32_t DeviceId);
-  int DebugLevel;
-
-public:
-  /// Creates a client for every address in the default-constructed
-  /// configuration.
-  RemoteClientManager() {
-    ClientManagerConfigTy Config;
-
-    grpc::ChannelArguments ChArgs;
-    ChArgs.SetMaxReceiveMessageSize(-1);
-    DebugLevel = getDebugLevel();
-    for (const auto &Address : Config.ServerAddresses) {
-      // The channel arguments only take effect when they are handed to the
-      // channel factory; CreateChannel() would ignore ChArgs entirely.
-      Clients.push_back(RemoteOffloadClient(
-          grpc::CreateCustomChannel(Address, grpc::InsecureChannelCredentials(),
-                                    ChArgs),
-          Config.Timeout, Config.MaxSize, Config.BlockSize));
-    }
-  }
-
-  int32_t shutdown(void);
-
-  int32_t registerLib(__tgt_bin_desc *Desc);
-  int32_t unregisterLib(__tgt_bin_desc *Desc);
-
-  int32_t isValidBinary(__tgt_device_image *Image);
-  int32_t getNumberOfDevices();
-
-  int32_t initDevice(int32_t DeviceId);
-  int32_t initRequires(int64_t RequiresFlags);
-
-  __tgt_target_table *loadBinary(int32_t DeviceId, __tgt_device_image *Image);
-
-  void *dataAlloc(int32_t DeviceId, int64_t Size, void *HstPtr);
-  int32_t dataDelete(int32_t DeviceId, void *TgtPtr);
-
-  int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                     int64_t Size);
-  int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                       int64_t Size);
-
-  int32_t isDataExchangeable(int32_t SrcDevId, int32_t DstDevId);
-  int32_t dataExchange(int32_t SrcDevId, void *SrcPtr, int32_t DstDevId,
-                       void *DstPtr, int64_t Size);
-
-  int32_t runTargetTeamRegion(int32_t DeviceId, void *TgtEntryPtr,
-                              void **TgtArgs, ptrdiff_t *TgtOffsets,
-                              int32_t ArgNum, int32_t TeamNum,
-                              int32_t ThreadLimit, uint64_t LoopTripCount);
-};
-
-#endif
diff --git a/openmp/libomptarget/plugins/remote/src/Client.cpp b/openmp/libomptarget/plugins/remote/src/Client.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/src/Client.cpp
+++ /dev/null
@@ -1,711 +0,0 @@
-//===----------------- Client.cpp - Client Implementation -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// gRPC (Client) for the remote plugin.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cmath>
-
-#include "Client.h"
-#include "omptarget.h"
-#include "openmp.pb.h"
-
-using namespace std::chrono;
-
-using grpc::ClientContext;
-using grpc::ClientReader;
-using grpc::ClientWriter;
-using grpc::Status;
-
-template <typename Fn1, typename Fn2, typename TReturn>
-auto RemoteOffloadClient::remoteCall(Fn1 Preprocessor, Fn2 Postprocessor,
-                                     TReturn ErrorValue, bool CanTimeOut) {
-  ArenaAllocatorLock->lock();
-  if (Arena->SpaceAllocated() >= MaxSize)
-    Arena->Reset();
-  ArenaAllocatorLock->unlock();
-
-  ClientContext Context;
-  if (CanTimeOut) {
-    auto Deadline =
-        std::chrono::system_clock::now() + std::chrono::seconds(Timeout);
-    Context.set_deadline(Deadline);
-  }
-
-  Status RPCStatus;
-  auto Reply = Preprocessor(RPCStatus, Context);
-
-  if (!RPCStatus.ok()) {
-    CLIENT_DBG("%s", RPCStatus.error_message().c_str())
-  } else {
-    return Postprocessor(Reply);
-  }
-
-  CLIENT_DBG("Failed")
-  return ErrorValue;
-}
-
-int32_t RemoteOffloadClient::shutdown(void) {
-  ClientContext Context;
-  Null Request;
-  I32 Reply;
-  CLIENT_DBG("Shutting down server.")
-  auto Status = Stub->Shutdown(&Context, Request, &Reply);
-  if (Status.ok())
-    return Reply.number();
-  return 1;
-}
-
-int32_t RemoteOffloadClient::registerLib(__tgt_bin_desc *Desc) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request = protobuf::Arena::CreateMessage<TargetBinaryDescription>(
-            Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        loadTargetBinaryDescription(Desc, *Request);
-        Request->set_bin_ptr((uint64_t)Desc);
-
-        RPCStatus = Stub->RegisterLib(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](const auto &Reply) {
-        if (Reply->number() == 0) {
-          CLIENT_DBG("Registered library")
-          return 0;
-        }
-        return 1;
-      },
-      /* Error Value */ 1);
-}
-
-int32_t RemoteOffloadClient::unregisterLib(__tgt_bin_desc *Desc) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request = protobuf::Arena::CreateMessage<Pointer>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-
-        Request->set_number((uint64_t)Desc);
-
-        RPCStatus = Stub->UnregisterLib(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](const auto &Reply) {
-        if (Reply->number() == 0) {
-          CLIENT_DBG("Unregistered library")
-          return 0;
-        }
-        CLIENT_DBG("Failed to unregister library")
-        return 1;
-      },
-      /* Error Value */ 1);
-}
-
-int32_t RemoteOffloadClient::isValidBinary(__tgt_device_image *Image) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request =
-            protobuf::Arena::CreateMessage<TargetDeviceImagePtr>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-
-        Request->set_image_ptr((uint64_t)Image->ImageStart);
-
-        auto *EntryItr = Image->EntriesBegin;
-        while (EntryItr != Image->EntriesEnd)
-          Request->add_entry_ptrs((uint64_t)EntryItr++);
-
-        RPCStatus = Stub->IsValidBinary(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](const auto &Reply) {
-        if (Reply->number()) {
-          CLIENT_DBG("Validated binary")
-        } else {
-          CLIENT_DBG("Could not validate binary")
-        }
-        return Reply->number();
-      },
-      /* Error Value */ 0);
-}
-
-int32_t RemoteOffloadClient::getNumberOfDevices() {
-  return remoteCall(
-      /* Preprocessor */
-      [&](Status &RPCStatus, ClientContext &Context) {
-        auto *Request = protobuf::Arena::CreateMessage<Null>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-
-        RPCStatus = Stub->GetNumberOfDevices(&Context, *Request, Reply);
-
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](const auto &Reply) {
-        if (Reply->number()) {
-          CLIENT_DBG("Found %d devices", Reply->number())
-        } else {
-          CLIENT_DBG("Could not get the number of devices")
-        }
-        return Reply->number();
-      },
-      /*Error Value*/ -1);
-}
-
-int32_t RemoteOffloadClient::initDevice(int32_t DeviceId) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-
-        Request->set_number(DeviceId);
-
-        RPCStatus = Stub->InitDevice(&Context, *Request, Reply);
-
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](const auto &Reply) {
-        if (!Reply->number()) {
-          CLIENT_DBG("Initialized device %d", DeviceId)
-        } else {
-          CLIENT_DBG("Could not initialize device %d", DeviceId)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1);
-}
-
-int32_t RemoteOffloadClient::initRequires(int64_t RequiresFlags) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request = protobuf::Arena::CreateMessage<I64>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        Request->set_number(RequiresFlags);
-        RPCStatus = Stub->InitRequires(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](const auto &Reply) {
-        if (Reply->number()) {
-          CLIENT_DBG("Initialized requires")
-        } else {
-          CLIENT_DBG("Could not initialize requires")
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1);
-}
-
-__tgt_target_table *RemoteOffloadClient::loadBinary(int32_t DeviceId,
-                                                    __tgt_device_image *Image) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *ImageMessage =
-            protobuf::Arena::CreateMessage<Binary>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<TargetTable>(Arena.get());
-        ImageMessage->set_image_ptr((uint64_t)Image->ImageStart);
-        ImageMessage->set_device_id(DeviceId);
-
-        RPCStatus = Stub->LoadBinary(&Context, *ImageMessage, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (Reply->entries_size() == 0) {
-          CLIENT_DBG("Could not load image %p onto device %d", Image, DeviceId)
-          return (__tgt_target_table *)nullptr;
-        }
-        DevicesToTables[DeviceId] = std::make_unique<__tgt_target_table>();
-        unloadTargetTable(*Reply, DevicesToTables[DeviceId].get(),
-                          RemoteEntries[DeviceId]);
-
-        CLIENT_DBG("Loaded Image %p to device %d with %d entries", Image,
-                   DeviceId, Reply->entries_size())
-
-        return DevicesToTables[DeviceId].get();
-      },
-      /* Error Value */ (__tgt_target_table *)nullptr,
-      /* CanTimeOut */ false);
-}
-
-int32_t RemoteOffloadClient::isDataExchangeable(int32_t SrcDevId,
-                                                int32_t DstDevId) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request = protobuf::Arena::CreateMessage<DevicePair>(Arena.get());
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-
-        Request->set_src_dev_id(SrcDevId);
-        Request->set_dst_dev_id(DstDevId);
-
-        RPCStatus = Stub->IsDataExchangeable(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (Reply->number()) {
-          CLIENT_DBG("Data is exchangeable between %d, %d", SrcDevId, DstDevId)
-        } else {
-          CLIENT_DBG("Data is not exchangeable between %d, %d", SrcDevId,
-                     DstDevId)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1);
-}
-
-void *RemoteOffloadClient::dataAlloc(int32_t DeviceId, int64_t Size,
-                                     void *HstPtr) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Reply = protobuf::Arena::CreateMessage<Pointer>(Arena.get());
-        auto *Request = protobuf::Arena::CreateMessage<AllocData>(Arena.get());
-
-        Request->set_device_id(DeviceId);
-        Request->set_size(Size);
-        Request->set_hst_ptr((uint64_t)HstPtr);
-
-        RPCStatus = Stub->DataAlloc(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (Reply->number()) {
-          CLIENT_DBG("Allocated %ld bytes on device %d at %p", Size, DeviceId,
-                     (void *)Reply->number())
-        } else {
-          CLIENT_DBG("Could not allocate %ld bytes on device %d at %p", Size,
-                     DeviceId, (void *)Reply->number())
-        }
-        return (void *)Reply->number();
-      },
-      /* Error Value */ (void *)nullptr);
-}
-
-int32_t RemoteOffloadClient::dataSubmit(int32_t DeviceId, void *TgtPtr,
-                                        void *HstPtr, int64_t Size) {
-
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        std::unique_ptr<ClientWriter<SubmitData>> Writer(
-            Stub->DataSubmit(&Context, Reply));
-
-        if (Size > BlockSize) {
-          int64_t Start = 0, End = BlockSize;
-          for (auto I = 0; I < ceil((float)Size / BlockSize); I++) {
-            auto *Request =
-                protobuf::Arena::CreateMessage<SubmitData>(Arena.get());
-
-            Request->set_device_id(DeviceId);
-            Request->set_data((char *)HstPtr + Start, End - Start);
-            Request->set_hst_ptr((uint64_t)HstPtr);
-            Request->set_tgt_ptr((uint64_t)TgtPtr);
-            Request->set_start(Start);
-            Request->set_size(Size);
-
-            if (!Writer->Write(*Request)) {
-              CLIENT_DBG("Broken stream when submitting data")
-              Reply->set_number(0);
-              return Reply;
-            }
-
-            Start += BlockSize;
-            End += BlockSize;
-            if (End >= Size)
-              End = Size;
-          }
-        } else {
-          auto *Request =
-              protobuf::Arena::CreateMessage<SubmitData>(Arena.get());
-
-          Request->set_device_id(DeviceId);
-          Request->set_data(HstPtr, Size);
-          Request->set_hst_ptr((uint64_t)HstPtr);
-          Request->set_tgt_ptr((uint64_t)TgtPtr);
-          Request->set_start(0);
-          Request->set_size(Size);
-
-          if (!Writer->Write(*Request)) {
-            CLIENT_DBG("Broken stream when submitting data")
-            Reply->set_number(0);
-            return Reply;
-          }
-        }
-
-        Writer->WritesDone();
-        RPCStatus = Writer->Finish();
-
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (!Reply->number()) {
-          CLIENT_DBG(" submitted %ld bytes on device %d at %p", Size, DeviceId,
-                     TgtPtr)
-        } else {
-          CLIENT_DBG("Could not async submit %ld bytes on device %d at %p",
-                     Size, DeviceId, TgtPtr)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1,
-      /* CanTimeOut */ false);
-}
-
-int32_t RemoteOffloadClient::dataRetrieve(int32_t DeviceId, void *HstPtr,
-                                          void *TgtPtr, int64_t Size) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Request =
-            protobuf::Arena::CreateMessage<RetrieveData>(Arena.get());
-
-        Request->set_device_id(DeviceId);
-        Request->set_size(Size);
-        Request->set_hst_ptr((int64_t)HstPtr);
-        Request->set_tgt_ptr((int64_t)TgtPtr);
-
-        auto *Reply = protobuf::Arena::CreateMessage<Data>(Arena.get());
-        std::unique_ptr<ClientReader<Data>> Reader(
-            Stub->DataRetrieve(&Context, *Request));
-        Reader->WaitForInitialMetadata();
-        while (Reader->Read(Reply)) {
-          if (Reply->ret()) {
-            CLIENT_DBG("Could not async retrieve %ld bytes on device %d at %p "
-                       "for %p",
-                       Size, DeviceId, TgtPtr, HstPtr)
-            return Reply;
-          }
-
-          if (Reply->start() == 0 && Reply->size() == Reply->data().size()) {
-            memcpy(HstPtr, Reply->data().data(), Reply->data().size());
-
-            return Reply;
-          }
-
-          memcpy((void *)((char *)HstPtr + Reply->start()),
-                 Reply->data().data(), Reply->data().size());
-        }
-        RPCStatus = Reader->Finish();
-
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (!Reply->ret()) {
-          CLIENT_DBG("Retrieved %ld bytes on Device %d", Size, DeviceId)
-        } else {
-          CLIENT_DBG("Could not async retrieve %ld bytes on Device %d", Size,
-                     DeviceId)
-        }
-        return Reply->ret();
-      },
-      /* Error Value */ -1,
-      /* CanTimeOut */ false);
-}
-
-int32_t RemoteOffloadClient::dataExchange(int32_t SrcDevId, void *SrcPtr,
-                                          int32_t DstDevId, void *DstPtr,
-                                          int64_t Size) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        auto *Request =
-            protobuf::Arena::CreateMessage<ExchangeData>(Arena.get());
-
-        Request->set_src_dev_id(SrcDevId);
-        Request->set_src_ptr((uint64_t)SrcPtr);
-        Request->set_dst_dev_id(DstDevId);
-        Request->set_dst_ptr((uint64_t)DstPtr);
-        Request->set_size(Size);
-
-        RPCStatus = Stub->DataExchange(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (Reply->number()) {
-          CLIENT_DBG(
-              "Exchanged %ld bytes on device %d at %p for %p on device %d",
-              Size, SrcDevId, SrcPtr, DstPtr, DstDevId)
-        } else {
-          CLIENT_DBG("Could not exchange %ld bytes on device %d at %p for %p "
-                     "on device %d",
-                     Size, SrcDevId, SrcPtr, DstPtr, DstDevId)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1);
-}
-
-int32_t RemoteOffloadClient::dataDelete(int32_t DeviceId, void *TgtPtr) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        auto *Request = protobuf::Arena::CreateMessage<DeleteData>(Arena.get());
-
-        Request->set_device_id(DeviceId);
-        Request->set_tgt_ptr((uint64_t)TgtPtr);
-
-        RPCStatus = Stub->DataDelete(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (!Reply->number()) {
-          CLIENT_DBG("Deleted data at %p on device %d", TgtPtr, DeviceId)
-        } else {
-          CLIENT_DBG("Could not delete data at %p on device %d", TgtPtr,
-                     DeviceId)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1);
-}
-
-int32_t RemoteOffloadClient::runTargetRegion(int32_t DeviceId,
-                                             void *TgtEntryPtr, void **TgtArgs,
-                                             ptrdiff_t *TgtOffsets,
-                                             int32_t ArgNum) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        auto *Request =
-            protobuf::Arena::CreateMessage<TargetRegion>(Arena.get());
-
-        Request->set_device_id(DeviceId);
-
-        Request->set_tgt_entry_ptr(
-            (uint64_t)RemoteEntries[DeviceId][TgtEntryPtr]);
-
-        char **ArgPtr = (char **)TgtArgs;
-        for (auto I = 0; I < ArgNum; I++, ArgPtr++)
-          Request->add_tgt_args((uint64_t)*ArgPtr);
-
-        char *OffsetPtr = (char *)TgtOffsets;
-        for (auto I = 0; I < ArgNum; I++, OffsetPtr++)
-          Request->add_tgt_offsets((uint64_t)*OffsetPtr);
-
-        Request->set_arg_num(ArgNum);
-
-        RPCStatus = Stub->RunTargetRegion(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (!Reply->number()) {
-          CLIENT_DBG("Ran target region async on device %d", DeviceId)
-        } else {
-          CLIENT_DBG("Could not run target region async on device %d", DeviceId)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1,
-      /* CanTimeOut */ false);
-}
-
-int32_t RemoteOffloadClient::runTargetTeamRegion(
-    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
-    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
-    uint64_t LoopTripcount) {
-  return remoteCall(
-      /* Preprocessor */
-      [&](auto &RPCStatus, auto &Context) {
-        auto *Reply = protobuf::Arena::CreateMessage<I32>(Arena.get());
-        auto *Request =
-            protobuf::Arena::CreateMessage<TargetTeamRegion>(Arena.get());
-
-        Request->set_device_id(DeviceId);
-
-        Request->set_tgt_entry_ptr(
-            (uint64_t)RemoteEntries[DeviceId][TgtEntryPtr]);
-
-        char **ArgPtr = (char **)TgtArgs;
-        for (auto I = 0; I < ArgNum; I++, ArgPtr++) {
-          Request->add_tgt_args((uint64_t)*ArgPtr);
-        }
-
-        char *OffsetPtr = (char *)TgtOffsets;
-        for (auto I = 0; I < ArgNum; I++, OffsetPtr++)
-          Request->add_tgt_offsets((uint64_t)*OffsetPtr);
-
-        Request->set_arg_num(ArgNum);
-        Request->set_team_num(TeamNum);
-        Request->set_thread_limit(ThreadLimit);
-        Request->set_loop_tripcount(LoopTripcount);
-
-        RPCStatus = Stub->RunTargetTeamRegion(&Context, *Request, Reply);
-        return Reply;
-      },
-      /* Postprocessor */
-      [&](auto &Reply) {
-        if (!Reply->number()) {
-          CLIENT_DBG("Ran target team region async on device %d", DeviceId)
-        } else {
-          CLIENT_DBG("Could not run target team region async on device %d",
-                     DeviceId)
-        }
-        return Reply->number();
-      },
-      /* Error Value */ -1,
-      /* CanTimeOut */ false);
-}
-
-int32_t RemoteClientManager::shutdown(void) {
-  int32_t Ret = 0;
-  for (auto &Client : Clients)
-    Ret &= Client.shutdown();
-  return Ret;
-}
-
-int32_t RemoteClientManager::registerLib(__tgt_bin_desc *Desc) {
-  int32_t Ret = 0;
-  for (auto &Client : Clients)
-    Ret &= Client.registerLib(Desc);
-  return Ret;
-}
-
-int32_t RemoteClientManager::unregisterLib(__tgt_bin_desc *Desc) {
-  int32_t Ret = 0;
-  for (auto &Client : Clients)
-    Ret &= Client.unregisterLib(Desc);
-  return Ret;
-}
-
-int32_t RemoteClientManager::isValidBinary(__tgt_device_image *Image) {
-  int32_t ClientIdx = 0;
-  for (auto &Client : Clients) {
-    if (auto Ret = Client.isValidBinary(Image))
-      return Ret;
-    ClientIdx++;
-  }
-  return 0;
-}
-
-int32_t RemoteClientManager::getNumberOfDevices() {
-  auto ClientIdx = 0;
-  for (auto &Client : Clients) {
-    if (auto NumDevices = Client.getNumberOfDevices()) {
-      Devices.push_back(NumDevices);
-    }
-    ClientIdx++;
-  }
-
-  return std::accumulate(Devices.begin(), Devices.end(), 0);
-}
-
-std::pair<int32_t, int32_t> RemoteClientManager::mapDeviceId(int32_t DeviceId) {
-  for (size_t ClientIdx = 0; ClientIdx < Devices.size(); ClientIdx++) {
-    if (DeviceId < Devices[ClientIdx])
-      return {ClientIdx, DeviceId};
-    DeviceId -= Devices[ClientIdx];
-  }
-  return {-1, -1};
-}
-
-int32_t RemoteClientManager::initDevice(int32_t DeviceId) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].initDevice(DeviceIdx);
-}
-
-int32_t RemoteClientManager::initRequires(int64_t RequiresFlags) {
-  for (auto &Client : Clients)
-    Client.initRequires(RequiresFlags);
-
-  return RequiresFlags;
-}
-
-__tgt_target_table *RemoteClientManager::loadBinary(int32_t DeviceId,
-                                                    __tgt_device_image *Image) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].loadBinary(DeviceIdx, Image);
-}
-
-int32_t RemoteClientManager::isDataExchangeable(int32_t SrcDevId,
-                                                int32_t DstDevId) {
-  int32_t SrcClientIdx, SrcDeviceIdx, DstClientIdx, DstDeviceIdx;
-  std::tie(SrcClientIdx, SrcDeviceIdx) = mapDeviceId(SrcDevId);
-  std::tie(DstClientIdx, DstDeviceIdx) = mapDeviceId(DstDevId);
-  return Clients[SrcClientIdx].isDataExchangeable(SrcDeviceIdx, DstDeviceIdx);
-}
-
-void *RemoteClientManager::dataAlloc(int32_t DeviceId, int64_t Size,
-                                     void *HstPtr) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].dataAlloc(DeviceIdx, Size, HstPtr);
-}
-
-int32_t RemoteClientManager::dataDelete(int32_t DeviceId, void *TgtPtr) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].dataDelete(DeviceIdx, TgtPtr);
-}
-
-int32_t RemoteClientManager::dataSubmit(int32_t DeviceId, void *TgtPtr,
-                                        void *HstPtr, int64_t Size) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].dataSubmit(DeviceIdx, TgtPtr, HstPtr, Size);
-}
-
-int32_t RemoteClientManager::dataRetrieve(int32_t DeviceId, void *HstPtr,
-                                          void *TgtPtr, int64_t Size) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].dataRetrieve(DeviceIdx, HstPtr, TgtPtr, Size);
-}
-
-int32_t RemoteClientManager::dataExchange(int32_t SrcDevId, void *SrcPtr,
-                                          int32_t DstDevId, void *DstPtr,
-                                          int64_t Size) {
-  int32_t SrcClientIdx, SrcDeviceIdx, DstClientIdx, DstDeviceIdx;
-  std::tie(SrcClientIdx, SrcDeviceIdx) = mapDeviceId(SrcDevId);
-  std::tie(DstClientIdx, DstDeviceIdx) = mapDeviceId(DstDevId);
-  return Clients[SrcClientIdx].dataExchange(SrcDeviceIdx, SrcPtr, DstDeviceIdx,
-                                            DstPtr, Size);
-}
-
-int32_t RemoteClientManager::runTargetRegion(int32_t DeviceId,
-                                             void *TgtEntryPtr, void **TgtArgs,
-                                             ptrdiff_t *TgtOffsets,
-                                             int32_t ArgNum) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].runTargetRegion(DeviceIdx, TgtEntryPtr, TgtArgs,
-                                            TgtOffsets, ArgNum);
-}
-
-int32_t RemoteClientManager::runTargetTeamRegion(
-    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
-    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
-    uint64_t LoopTripCount) {
-  int32_t ClientIdx, DeviceIdx;
-  std::tie(ClientIdx, DeviceIdx) = mapDeviceId(DeviceId);
-  return Clients[ClientIdx].runTargetTeamRegion(DeviceIdx, TgtEntryPtr, TgtArgs,
-                                                TgtOffsets, ArgNum, TeamNum,
-                                                ThreadLimit, LoopTripCount);
-}
diff --git a/openmp/libomptarget/plugins/remote/src/rtl.cpp b/openmp/libomptarget/plugins/remote/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/remote/src/rtl.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===--------------------- rtl.cpp - Remote RTL Plugin --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for Host.
-//
-//===----------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <memory>
-#include <string>
-
-#include "Client.h"
-#include "Utils.h"
-#include "omptarget.h"
-#include "omptargetplugin.h"
-
-#define TARGET_NAME RPC
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-
-RemoteClientManager *Manager;
-
-__attribute__((constructor(101))) void initRPC() {
-  DP("Init RPC library!\n");
-
-  Manager = new RemoteClientManager();
-}
-
-__attribute__((destructor(101))) void deinitRPC() {
-  Manager->shutdown(); // TODO: Error handle shutting down
-  DP("Deinit RPC library!\n");
-  delete Manager;
-}
-
-// Exposed library API function
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_register_lib(__tgt_bin_desc *Desc) {
-  return Manager->registerLib(Desc);
-}
-
-int32_t __tgt_rtl_unregister_lib(__tgt_bin_desc *Desc) {
-  return Manager->unregisterLib(Desc);
-}
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-  return Manager->isValidBinary(Image);
-}
-
-int32_t __tgt_rtl_number_of_devices() { return Manager->getNumberOfDevices(); }
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) {
-  return Manager->initDevice(DeviceId);
-}
-
-int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
-  return Manager->initRequires(RequiresFlags);
-}
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-  return Manager->loadBinary(DeviceId, (__tgt_device_image *)Image);
-}
-
-int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int32_t DstDevId) {
-  return Manager->isDataExchangeable(SrcDevId, DstDevId);
-}
-
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
-                           int32_t Kind) {
-  if (Kind != TARGET_ALLOC_DEFAULT) {
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-    return NULL;
-  }
-
-  return Manager->dataAlloc(DeviceId, Size, HstPtr);
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  return Manager->dataSubmit(DeviceId, TgtPtr, HstPtr, Size);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  return Manager->dataRetrieve(DeviceId, HstPtr, TgtPtr, Size);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
-  return Manager->dataDelete(DeviceId, TgtPtr);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
-                                int32_t DstDevId, void *DstPtr, int64_t Size) {
-  return Manager->dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size);
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "Only one dimensional kernels supported.");
-  return Manager->runTargetTeamRegion(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs->NumArgs,
-      KernelArgs->NumTeams[0], KernelArgs->ThreadLimit[0],
-      KernelArgs->Tripcount);
-}
-
-// Exposed library API function
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/ve/CMakeLists.txt b/openmp/libomptarget/plugins/ve/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ve/CMakeLists.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a NEC Aurora machine if available. (Can also run on host)
-#
-##===----------------------------------------------------------------------===##
-
-
-if(${LIBOMPTARGET_DEP_VEO_FOUND})
-  libomptarget_say("Building SX-Aurora VE offloading plugin.")
-  set(additional_libs "")
-  set(additional_libs ${LIBOMPTARGET_DEP_VEO_LIBRARIES}
-                      ${LIBOMPTARGET_DEP_VEOSINFO_LIBRARIES}
-                      ${additional_libs})
-
-  set(tmachine_name "ve")
-  set(tmachine_libname "ve")
-  set(tmachine_triple "ve-unknown-linux-unknown")
-  set(elf_machine_id 251)
-
-  # Define macro to be used as prefix of the runtime messages for this target.
-  add_definitions("-DTARGET_NAME=${tmachine_name}")
-
-  # Define macro with the ELF ID for this target.
-  add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
-
-  add_llvm_library("omptarget.rtl.${tmachine_libname}" 
-    SHARED
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/rtl.cpp
-
-    ADDITIONAL_HEADER_DIRS
-    ${LIBOMPTARGET_INCLUDE_DIR}
-    ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR}
-
-    LINK_LIBS
-    PRIVATE
-    elf_common
-    ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
-    ${additional_libs}
-    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs"
-
-    NO_INSTALL_RPATH
-  )
-
-  # Install plugin under the lib destination folder.
-  install(TARGETS  "omptarget.rtl.${tmachine_libname}" LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-  set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES 
-    INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-    CXX_VISIBILITY_PRESET protected)
-
-  target_include_directories("omptarget.rtl.${tmachine_libname}" PRIVATE
-    ${LIBOMPTARGET_INCLUDE_DIR}
-    ${LIBOMPTARGET_DEP_VEO_INCLUDE_DIR})
-
-  target_link_libraries(
-    "omptarget.rtl.${tmachine_libname}"
-    elf_common
-    ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
-    ${additional_libs}
-    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports -Wl,-z,defs")
-
-  # Report to the parent scope that we are building a plugin.
-  set(LIBOMPTARGET_SYSTEM_TARGETS
-    "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE)
-else()
-    libomptarget_say("Not building nec-aurora plugin: libveo or libveosinfo not found.")
-endif()
diff --git a/openmp/libomptarget/plugins/ve/src/rtl.cpp b/openmp/libomptarget/plugins/ve/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ve/src/rtl.cpp
+++ /dev/null
@@ -1,453 +0,0 @@
-//===-RTLs/nec-aurora/src/rtl.cpp - Target RTLs Implementation - C++ -*-======//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for NEC Aurora TSUBASA machines
-//
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <cassert>
-#include <cerrno>
-#include <cstring>
-#include <list>
-#include <stdlib.h>
-#include <string>
-#include <sys/stat.h>
-#include <ve_offload.h>
-#include <vector>
-#include <veosinfo/veosinfo.h>
-
-#include "Debug.h"
-#include "omptargetplugin.h"
-
-#ifndef TARGET_NAME
-#define TARGET_NAME VE
-#endif
-
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID 0
-#endif
-
-#include "elf_common.h"
-
-struct DynLibTy {
-  char *FileName;
-  uint64_t VeoLibHandle;
-};
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  std::vector<__tgt_offload_entry> Entries;
-};
-
-class RTLDeviceInfoTy {
-  std::vector<std::list<FuncOrGblEntryTy>> FuncOrGblEntry;
-
-public:
-  std::vector<struct veo_proc_handle *> ProcHandles;
-  std::vector<struct veo_thr_ctxt *> Contexts;
-  std::vector<uint64_t> LibraryHandles;
-  std::list<DynLibTy> DynLibs;
-  // Maps OpenMP device Ids to Ve nodeids
-  std::vector<int> NodeIds;
-
-  void buildOffloadTableFromHost(int32_t device_id, uint64_t VeoLibHandle,
-                                 __tgt_offload_entry *HostBegin,
-                                 __tgt_offload_entry *HostEnd) {
-    FuncOrGblEntry[device_id].emplace_back();
-    std::vector<__tgt_offload_entry> &T =
-        FuncOrGblEntry[device_id].back().Entries;
-    T.clear();
-    for (__tgt_offload_entry *i = HostBegin; i != HostEnd; ++i) {
-      char *SymbolName = i->name;
-      // we have not enough access to the target memory to conveniently parse
-      // the offload table there so we need to lookup every symbol with the host
-      // table
-      DP("Looking up symbol: %s\n", SymbolName);
-      uint64_t SymbolTargetAddr =
-          veo_get_sym(ProcHandles[device_id], VeoLibHandle, SymbolName);
-      __tgt_offload_entry Entry;
-
-      if (!SymbolTargetAddr) {
-        DP("Symbol %s not found in target image\n", SymbolName);
-        Entry = {NULL, NULL, 0, 0, 0};
-      } else {
-        DP("Found symbol %s successfully in target image (addr: %p)\n",
-           SymbolName, reinterpret_cast<void *>(SymbolTargetAddr));
-        Entry = {reinterpret_cast<void *>(SymbolTargetAddr), i->name, i->size,
-                 i->flags, 0};
-      }
-
-      T.push_back(Entry);
-    }
-
-    FuncOrGblEntry[device_id].back().Table.EntriesBegin = &T.front();
-    FuncOrGblEntry[device_id].back().Table.EntriesEnd = &T.back() + 1;
-  }
-
-  __tgt_target_table *getOffloadTable(int32_t device_id) {
-    return &FuncOrGblEntry[device_id].back().Table;
-  }
-
-  RTLDeviceInfoTy() {
-
-    struct ve_nodeinfo node_info;
-    ve_node_info(&node_info);
-
-    // Build a predictable mapping between VE node ids and OpenMP device ids.
-    // This is necessary, because nodes can be missing or offline and (active)
-    // node ids are thus not consecutive. The entries in ve_nodeinfo may also
-    // not be in the order of their node ids.
-    for (int i = 0; i < node_info.total_node_count; ++i) {
-      if (node_info.status[i] == 0) {
-        NodeIds.push_back(node_info.nodeid[i]);
-      }
-    }
-
-    // Because the entries in ve_nodeinfo may not be in the order of their node
-    // ids, we sort NodeIds to get a predictable mapping.
-    std::sort(NodeIds.begin(), NodeIds.end());
-
-    int NumDevices = NodeIds.size();
-    DP("Found %i VE devices\n", NumDevices);
-    ProcHandles.resize(NumDevices, NULL);
-    Contexts.resize(NumDevices, NULL);
-    FuncOrGblEntry.resize(NumDevices);
-    LibraryHandles.resize(NumDevices);
-  }
-
-  ~RTLDeviceInfoTy() {
-    for (auto &ctx : Contexts) {
-      if (ctx != NULL) {
-        if (veo_context_close(ctx) != 0) {
-          DP("Failed to close VEO context.\n");
-        }
-      }
-    }
-
-    for (auto &hdl : ProcHandles) {
-      if (hdl != NULL) {
-        veo_proc_destroy(hdl);
-      }
-    }
-
-    for (auto &lib : DynLibs) {
-      if (lib.FileName) {
-        remove(lib.FileName);
-      }
-    }
-  }
-};
-
-static RTLDeviceInfoTy DeviceInfo;
-
-static int target_run_function_wait(uint32_t DeviceID, uint64_t FuncAddr,
-                                    struct veo_args *args, uint64_t *RetVal) {
-  DP("Running function with entry point %p\n",
-     reinterpret_cast<void *>(FuncAddr));
-  uint64_t RequestHandle =
-      veo_call_async(DeviceInfo.Contexts[DeviceID], FuncAddr, args);
-  if (RequestHandle == VEO_REQUEST_ID_INVALID) {
-    DP("Execution of entry point %p failed\n",
-       reinterpret_cast<void *>(FuncAddr));
-    return OFFLOAD_FAIL;
-  }
-
-  DP("Function at address %p called (VEO request ID: %" PRIu64 ")\n",
-     reinterpret_cast<void *>(FuncAddr), RequestHandle);
-
-  int ret = veo_call_wait_result(DeviceInfo.Contexts[DeviceID], RequestHandle,
-                                 RetVal);
-  if (ret != 0) {
-    DP("Waiting for entry point %p failed (Error code %d)\n",
-       reinterpret_cast<void *>(FuncAddr), ret);
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-// Return the number of available devices of the type supported by the
-// target RTL.
-int32_t __tgt_rtl_number_of_devices(void) { return DeviceInfo.NodeIds.size(); }
-
-// Return an integer different from zero if the provided device image can be
-// supported by the runtime. The functionality is similar to comparing the
-// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a
-// lightweight query to determine if the RTL is suitable for an image without
-// having to load the library, which can be expensive.
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-#if TARGET_ELF_ID < 1
-  return 0;
-#else
-  return elf_check_machine(Image, TARGET_ELF_ID);
-#endif
-}
-
-// Initialize the specified device. In case of success return 0; otherwise
-// return an error code.
-int32_t __tgt_rtl_init_device(int32_t ID) {
-  DP("Available VEO version: %i\n", veo_api_version());
-
-  // At the moment we do not really initialize (i.e. create a process or
-  // context on) the device here, but in "__tgt_rtl_load_binary".
-  // The reason for this is, that, when we create a process for a statically
-  // linked binary, the VEO api needs us to already supply the binary (but we
-  // can load a dynamically linked binary later, after we create the process).
-  // At this stage, we cannot check if we have a dynamically or statically
-  // linked binary so we defer process creation until we know.
-  return OFFLOAD_SUCCESS;
-}
-
-// Pass an executable image section described by image to the specified
-// device and prepare an address table of target entities. In case of error,
-// return NULL. Otherwise, return a pointer to the built address table.
-// Individual entries in the table may also be NULL, when the corresponding
-// offload region is not supported on the target device.
-__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
-                                          __tgt_device_image *Image) {
-  DP("Dev %d: load binary from " DPxMOD " image\n", ID,
-     DPxPTR(Image->ImageStart));
-
-  assert(ID >= 0 && "bad dev id");
-
-  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
-  size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
-  DP("Expecting to have %zd entries defined.\n", NumEntries);
-
-  // load dynamic library and get the entry points. We use the dl library
-  // to do the loading of the library, but we could do it directly to avoid the
-  // dump to the temporary file.
-  //
-  // 1) Create tmp file with the library contents.
-  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
-  char tmp_name[] = "/tmp/tmpfile_XXXXXX";
-  int tmp_fd = mkstemp(tmp_name);
-
-  if (tmp_fd == -1) {
-    return NULL;
-  }
-
-  FILE *ftmp = fdopen(tmp_fd, "wb");
-
-  if (!ftmp) {
-    DP("fdopen() for %s failed. Could not write target image\n", tmp_name);
-    return NULL;
-  }
-
-  fwrite(Image->ImageStart, ImageSize, 1, ftmp);
-
-  // at least for the static case we need to change the permissions
-  chmod(tmp_name, 0700);
-
-  DP("Wrote target image to %s. ImageSize=%zu\n", tmp_name, ImageSize);
-
-  fclose(ftmp);
-
-  // See comment in "__tgt_rtl_init_device"
-  bool is_dyn = true;
-  if (DeviceInfo.ProcHandles[ID] == NULL) {
-    struct veo_proc_handle *proc_handle;
-    is_dyn = elf_is_dynamic(Image);
-    // If we have a dynamically linked image, we create the process handle, then
-    // the thread, and then load the image.
-    // If we have a statically linked image, we need to create the process
-    // handle and load the image at the same time with veo_proc_create_static().
-    if (is_dyn) {
-      proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
-      if (!proc_handle) {
-        DP("veo_proc_create() failed for device %d\n", ID);
-        return NULL;
-      }
-    } else {
-      proc_handle = veo_proc_create_static(DeviceInfo.NodeIds[ID], tmp_name);
-      if (!proc_handle) {
-        DP("veo_proc_create_static() failed for device %d, image=%s\n", ID,
-           tmp_name);
-        return NULL;
-      }
-    }
-    DeviceInfo.ProcHandles[ID] = proc_handle;
-  }
-
-  if (DeviceInfo.Contexts[ID] == NULL) {
-    struct veo_thr_ctxt *ctx = veo_context_open(DeviceInfo.ProcHandles[ID]);
-
-    if (!ctx) {
-      DP("veo_context_open() failed: %s\n", std::strerror(errno));
-      return NULL;
-    }
-
-    DeviceInfo.Contexts[ID] = ctx;
-  }
-
-  DP("Aurora device successfully initialized with loaded binary: "
-     "proc_handle=%p, ctx=%p\n",
-     DeviceInfo.ProcHandles[ID], DeviceInfo.Contexts[ID]);
-
-  uint64_t LibHandle = 0UL;
-  if (is_dyn) {
-    LibHandle = veo_load_library(DeviceInfo.ProcHandles[ID], tmp_name);
-
-    if (!LibHandle) {
-      DP("veo_load_library() failed: LibHandle=%" PRIu64
-         " Name=%s. Set env VEORUN_BIN for static linked target code.\n",
-         LibHandle, tmp_name);
-      return NULL;
-    }
-
-    DP("Successfully loaded library dynamically\n");
-  } else {
-    DP("Symbol table is expected to have been created by "
-       "veo_create_proc_static()\n");
-  }
-
-  DynLibTy Lib = {tmp_name, LibHandle};
-  DeviceInfo.DynLibs.push_back(Lib);
-  DeviceInfo.LibraryHandles[ID] = LibHandle;
-
-  DeviceInfo.buildOffloadTableFromHost(ID, LibHandle, Image->EntriesBegin,
-                                       Image->EntriesEnd);
-
-  return DeviceInfo.getOffloadTable(ID);
-}
-
-// Allocate data on the particular target device, of the specified size.
-// HostPtr is a address of the host data the allocated target data
-// will be associated with (HostPtr may be NULL if it is not known at
-// allocation time, like for example it would be for target data that
-// is allocated by omp_target_alloc() API). Return address of the
-// allocated data on the target that will be used by libomptarget.so to
-// initialize the target data mapping structures. These addresses are
-// used to generate a table of target variables to pass to
-// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in
-// case an error occurred on the target device.
-void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr,
-                           int32_t kind) {
-  int ret;
-  uint64_t addr;
-
-  if (kind != TARGET_ALLOC_DEFAULT) {
-    REPORT("Invalid target data allocation kind or requested allocator not "
-           "implemented yet\n");
-    return NULL;
-  }
-
-  if (DeviceInfo.ProcHandles[ID] == NULL) {
-    struct veo_proc_handle *proc_handle;
-    proc_handle = veo_proc_create(DeviceInfo.NodeIds[ID]);
-    if (!proc_handle) {
-      DP("veo_proc_create() failed for device %d\n", ID);
-      return NULL;
-    }
-    DeviceInfo.ProcHandles[ID] = proc_handle;
-    DP("Aurora device successfully initialized: proc_handle=%p", proc_handle);
-  }
-
-  ret = veo_alloc_mem(DeviceInfo.ProcHandles[ID], &addr, Size);
-  DP("Allocate target memory: device=%d, target addr=%p, size=%" PRIu64 "\n",
-     ID, reinterpret_cast<void *>(addr), Size);
-  if (ret != 0) {
-    DP("veo_alloc_mem(%d, %p, %" PRIu64 ") failed with error code %d\n", ID,
-       reinterpret_cast<void *>(addr), Size, ret);
-    return NULL;
-  }
-
-  return reinterpret_cast<void *>(addr);
-}
-
-// Pass the data content to the target device using the target address.
-// In case of success, return zero. Otherwise, return an error code.
-int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
-                              int64_t Size) {
-  int ret = veo_write_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr,
-                          HostPtr, (size_t)Size);
-  if (ret != 0) {
-    DP("veo_write_mem() failed with error code %d\n", ret);
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-// Retrieve the data content from the target device using its address.
-// In case of success, return zero. Otherwise, return an error code.
-int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
-                                int64_t Size) {
-  int ret = veo_read_mem(DeviceInfo.ProcHandles[ID], HostPtr,
-                         (uint64_t)TargetPtr, Size);
-  if (ret != 0) {
-    DP("veo_read_mem() failed with error code %d\n", ret);
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-// De-allocate the data referenced by target ptr on the device. In case of
-// success, return zero. Otherwise, return an error code.
-int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr, int32_t) {
-  int ret = veo_free_mem(DeviceInfo.ProcHandles[ID], (uint64_t)TargetPtr);
-
-  if (ret != 0) {
-    DP("veo_free_mem() failed with error code %d\n", ret);
-    return OFFLOAD_FAIL;
-  }
-  return OFFLOAD_SUCCESS;
-}
-
-// Transfer control to the offloaded entry Entry on the target device.
-// Args and Offsets are arrays of NumArgs size of target addresses and
-// offsets. An offset should be added to the target address before passing it
-// to the outlined function on device side. In case of success, return zero.
-// Otherwise, return an error code.
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "Only one dimensional kernels supported.");
-  int ret;
-
-  // ignore team num and thread limit.
-  std::vector<void *> ptrs(KernelArgs->NumArgs);
-
-  struct veo_args *TargetArgs;
-  TargetArgs = veo_args_alloc();
-
-  if (TargetArgs == NULL) {
-    DP("Could not allocate VEO args\n");
-    return OFFLOAD_FAIL;
-  }
-
-  for (int i = 0; i < KernelArgs->NumArgs; ++i) {
-    ret = veo_args_set_u64(TargetArgs, i, (intptr_t)Args[i]);
-
-    if (ret != 0) {
-      DP("veo_args_set_u64() has returned %d for argnum=%d and value %p\n", ret,
-         i, Args[i]);
-      return OFFLOAD_FAIL;
-    }
-  }
-
-  uint64_t RetVal;
-  if (target_run_function_wait(ID, reinterpret_cast<uint64_t>(Entry),
-                               TargetArgs, &RetVal) != OFFLOAD_SUCCESS) {
-    veo_args_free(TargetArgs);
-    return OFFLOAD_FAIL;
-  }
-  veo_args_free(TargetArgs);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_supports_empty_images() { return 1; }
-
-// VEC plugin's internal InfoLevel.
-std::atomic<uint32_t> InfoLevel;
diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-# 
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# 
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a x86_64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
-else()
- libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -92,8 +92,6 @@
 
   DP("Loading RTLs...\n");
 
-  BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", true);
-
   // Attempt to open all the plugins and, if they exist, check if the interface
   // is correct and if they are supporting any devices.
   for (const char *Name : RTLNames) {
@@ -102,14 +100,7 @@
     RTLInfoTy &RTL = AllRTLs.back();
 
     const std::string BaseRTLName(Name);
-    if (NextGenPlugins) {
-      if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL))
-        continue;
-
-      DP("Falling back to original plugin...\n");
-    }
-
-    if (!attemptLoadRTL(BaseRTLName + ".so", RTL))
+    if (!attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL))
       AllRTLs.pop_back();
   }
 
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
--- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
@@ -1,6 +1,5 @@
 // RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device -I %S
-// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \
-// RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
+// RUN: %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
 // REQUIRES: amdgcn-amd-amdhsa
 
 #include "omp_dynamic_shared_memory_mixed.inc"
diff --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c
--- a/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory_mixed_nvptx.c
@@ -1,7 +1,6 @@
 // RUN: %libomptarget-compile-nvptx64-nvidia-cuda -I %S
-// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \
-// RUN:   %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
 // REQUIRES: nvptx64-nvidia-cuda
 
 #include "omp_dynamic_shared_memory_mixed.inc"
 // CHECK: PASS
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -21,10 +21,6 @@
 if 'LIBOMPTARGET_DEBUG' in os.environ:
     config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
 
-# Allow running the tests with nextgen plugins when available
-if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ:
-    config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS']
-
 if 'OMP_TARGET_OFFLOAD' in os.environ:
     config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD']
 
@@ -118,9 +114,7 @@
         config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
     if config.libomptarget_current_target.endswith('-LTO'):
         config.test_flags += " -foffload-lto"
-    if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env(
-        config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS']
-    ):
+    if config.libomptarget_current_target.endswith('-JIT-LTO'):
         config.test_flags += " -foffload-lto"
         config.test_flags += " -Wl,--embed-bitcode"
 
diff --git a/openmp/libomptarget/test/mapping/prelock.cpp b/openmp/libomptarget/test/mapping/prelock.cpp
--- a/openmp/libomptarget/test/mapping/prelock.cpp
+++ b/openmp/libomptarget/test/mapping/prelock.cpp
@@ -1,5 +1,4 @@
-// RUN: %libomptarget-compilexx-generic
-// RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=0 %libomptarget-run-generic %fcheck-generic
+// RUN: %libomptarget-compilexx-run-and-check-generic
 
 // UNSUPPORTED: aarch64-unknown-linux-gnu
 // UNSUPPORTED: aarch64-unknown-linux-gnu-LTO