diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1378,7 +1378,7 @@
 .. code-block:: console

     $ clang++ -fopenmp --offload-arch=gfx90a -O3 shared.c
-    $ env LIBOMPTARGET_NEXTGEN_PLUGINS=1 ./shared
+    $ ./shared

 .. _libomptarget_device_debugging:
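With the nextgen plugins now the only implementation, the documented example no longer needs the LIBOMPTARGET_NEXTGEN_PLUGINS toggle. If you want to confirm at run time which plugin library the runtime picked up, the existing debugging knobs described in this document still apply; an illustrative invocation (assuming a runtime built with debugging support, and that the debug output contains a "Loading library" line) would be:

    $ env LIBOMPTARGET_DEBUG=1 ./shared 2>&1 | grep "Loading library"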
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -105,7 +105,6 @@
   "Path to folder containing llvm library libomptarget.so")

 # Build offloading plugins and device RTLs if they are available.
-add_subdirectory(plugins)
 add_subdirectory(plugins-nextgen)
 add_subdirectory(DeviceRTL)
 add_subdirectory(tools)
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -12,12 +12,12 @@

 add_subdirectory(common)

-# void build_generic_elf64_nextgen(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
+# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string tmachine_triple, string elf_machine_id);
 # - build a plugin for an ELF based generic 64-bit target based on libffi.
 # - tmachine: name of the machine processor as used in the cmake build system.
 # - tmachine_name: name of the machine to be printed with the debug messages.
 # - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64_nextgen tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
   if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
@@ -36,7 +36,7 @@
     # Define target triple
     add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine}")

-    add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen"
+    add_llvm_library("omptarget.rtl.${tmachine_libname}"
       SHARED

       ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
@@ -58,23 +58,23 @@
     )

     if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-      target_link_libraries("omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE
+      target_link_libraries("omptarget.rtl.${tmachine_libname}" PRIVATE
         "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
     endif()

     # Install plugin under the lib destination folder.
-    install(TARGETS "omptarget.rtl.${tmachine_libname}.nextgen"
+    install(TARGETS "omptarget.rtl.${tmachine_libname}"
       LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-    set_target_properties("omptarget.rtl.${tmachine_libname}.nextgen" PROPERTIES
+    set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES
       INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
       CXX_VISIBILITY_PRESET protected)

-    target_include_directories( "omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE
+    target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE
       ${LIBOMPTARGET_INCLUDE_DIR}
       ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})

     list(APPEND LIBOMPTARGET_TESTED_PLUGINS
-      "omptarget.rtl.${tmachine_libname}.nextgen")
+      "omptarget.rtl.${tmachine_libname}")

   else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
     libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: libffi dependency not found.")
diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##

 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
 else()
   libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -52,8 +52,8 @@
   set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64)
 else()
   libomptarget_say("Building AMDGPU NextGen plugin for dlopened libhsa")
-  include_directories(../../plugins/amdgpu/dynamic_hsa)
-  set(LIBOMPTARGET_EXTRA_SOURCE ../../plugins/amdgpu/dynamic_hsa/hsa.cpp)
+  include_directories(dynamic_hsa)
+  set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp)
   set(LIBOMPTARGET_DEP_LIBRARIES)
 endif()
@@ -66,7 +66,7 @@
   set(LDFLAGS_UNDEFINED "-Wl,-z,defs")
 endif()

-add_llvm_library(omptarget.rtl.amdgpu.nextgen SHARED
+add_llvm_library(omptarget.rtl.amdgpu SHARED
   src/rtl.cpp
   ${LIBOMPTARGET_EXTRA_SOURCE}

@@ -91,16 +91,16 @@
 )

 if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
-  target_link_libraries(omptarget.rtl.amdgpu.nextgen PRIVATE OMPT)
+  target_link_libraries(omptarget.rtl.amdgpu PRIVATE OMPT)
 endif()

 if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-  target_link_libraries(omptarget.rtl.amdgpu.nextgen PRIVATE
+  target_link_libraries(omptarget.rtl.amdgpu PRIVATE
     "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
 endif()

 target_include_directories(
-  omptarget.rtl.amdgpu.nextgen
+  omptarget.rtl.amdgpu
   PRIVATE
   ${LIBOMPTARGET_INCLUDE_DIR}
   ${CMAKE_CURRENT_SOURCE_DIR}/utils
@@ -108,7 +108,7 @@

 # Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu.nextgen PROPERTIES
+install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.amdgpu PROPERTIES
   INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
   CXX_VISIBILITY_PRESET protected)
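The AMDGPU hunks above only rename the target and re-home dynamic_hsa; the plugin keeps both ways of resolving HSA: linking hsa-runtime64::hsa-runtime64 directly when the ROCm CMake package is found, or compiling the dynamic_hsa shim, which resolves the entry points via dlopen at run time. As an illustrative sketch (flags shown on an otherwise ordinary runtimes configuration, not part of this patch), the shim can be forced even on a machine with ROCm installed:

    $ cmake <usual runtimes configuration> -DLIBOMPTARGET_FORCE_DLOPEN_LIBHSA=ON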
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.h
rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa.cpp
rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
diff --git a/openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
rename from openmp/libomptarget/plugins/amdgpu/dynamic_hsa/hsa_ext_amd.h
rename to openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
@@ -12,3 +12,5 @@

 add_subdirectory(OMPT)
 add_subdirectory(PluginInterface)
+add_subdirectory(MemoryManager)
+add_subdirectory(elf_common)
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt
rename from openmp/libomptarget/plugins/common/MemoryManager/CMakeLists.txt
rename to openmp/libomptarget/plugins-nextgen/common/MemoryManager/CMakeLists.txt
diff --git a/openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
rename from openmp/libomptarget/plugins/common/MemoryManager/MemoryManager.h
rename to openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
rename from openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
--- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/elf_common/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##

 # NOTE: Don't try to build `elf_common` using `add_llvm_library`.
-# See openmp/libomptarget/plugins/common/PluginInterface/CMakeLists.txt
+# See openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
 # for more explanation.
 add_library(elf_common OBJECT elf_common.cpp ELFSymbols.cpp)
diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h b/openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h
rename from openmp/libomptarget/plugins/common/elf_common/ELFSymbols.h
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.h
diff --git a/openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp b/openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp
rename from openmp/libomptarget/plugins/common/elf_common/ELFSymbols.cpp
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/ELFSymbols.cpp
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.h b/openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h
rename from openmp/libomptarget/plugins/common/elf_common/elf_common.h
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.h
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_common.cpp b/openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp
rename from openmp/libomptarget/plugins/common/elf_common/elf_common.cpp
rename to openmp/libomptarget/plugins-nextgen/common/elf_common/elf_common.cpp
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
@@ -26,7 +26,7 @@
 set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
 option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})

-add_llvm_library(omptarget.rtl.cuda.nextgen SHARED
+add_llvm_library(omptarget.rtl.cuda SHARED
   src/rtl.cpp

   LINK_COMPONENTS
@@ -43,33 +43,33 @@
 )

 if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
-  target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE OMPT)
+  target_link_libraries(omptarget.rtl.cuda PRIVATE OMPT)
 endif()

 if (LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
-  target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE
+  target_link_libraries(omptarget.rtl.cuda PRIVATE
     "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs")
 endif()

 if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
   libomptarget_say("Building CUDA plugin linked against libcuda")
-  target_link_libraries(omptarget.rtl.cuda.nextgen PRIVATE CUDA::cuda_driver)
+  target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
 else()
   libomptarget_say("Building CUDA plugin for dlopened libcuda")
-  target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE ../../plugins/cuda/dynamic_cuda)
-  target_sources(omptarget.rtl.cuda.nextgen PRIVATE ../../plugins/cuda/dynamic_cuda/cuda.cpp)
+  target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda)
+  target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp)
 endif()

 # Define debug prefix. TODO: This should be automatized in the Debug.h but it
 # requires changing the original plugins.
-target_compile_definitions(omptarget.rtl.cuda.nextgen PRIVATE TARGET_NAME="CUDA")
-target_compile_definitions(omptarget.rtl.cuda.nextgen PRIVATE DEBUG_PREFIX="TARGET CUDA RTL")
+target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA")
+target_compile_definitions(omptarget.rtl.cuda PRIVATE DEBUG_PREFIX="TARGET CUDA RTL")

-target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
+target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})

 # Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES
+install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.cuda PROPERTIES
   INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
   CXX_VISIBILITY_PRESET protected)
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
rename from openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
rename to openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
rename from openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.cpp
rename to openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##

 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
 else()
   libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##

 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
 else()
   libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.")
 endif()
diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
--- a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
@@ -11,7 +11,7 @@
 ##===----------------------------------------------------------------------===##

 if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
 else()
   libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.")
 endif()
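In the per-target files above, the trailing numeric argument to build_generic_elf64 is the ELF e_machine value the generic plugin advertises for its images. The numbers match the standard constants in llvm/BinaryFormat/ELF.h; a standalone compile-time check (a sketch for reference, not part of the patch) makes the mapping explicit:

    #include "llvm/BinaryFormat/ELF.h"
    static_assert(llvm::ELF::EM_PPC64 == 21, "ppc64/ppc64le elf_machine_id");
    static_assert(llvm::ELF::EM_X86_64 == 62, "x86_64 elf_machine_id");
    static_assert(llvm::ELF::EM_AARCH64 == 183, "aarch64 elf_machine_id");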
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build plugins for the user system if available.
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(common)
-
-# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
-# - build a plugin for an ELF based generic 64-bit target based on libffi.
-# - tmachine: name of the machine processor as used in the cmake build system.
-# - tmachine_name: name of the machine to be printed with the debug messages.
-# - tmachine_libname: machine name to be appended to the plugin library name.
-macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
-  if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-
-    libomptarget_say("Building ${tmachine_name} offloading plugin.")
-
-    # Define macro to be used as prefix of the runtime messages for this target.
-    add_definitions("-DTARGET_NAME=${tmachine_name}")
-
-    # Define macro with the ELF ID for this target.
-    add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
-
-    add_llvm_library("omptarget.rtl.${tmachine_libname}"
-      SHARED
-
-      ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
-
-      ADDITIONAL_HEADER_DIRS
-      ${LIBOMPTARGET_INCLUDE_DIR}
-      ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}
-
-      LINK_LIBS
-      PRIVATE
-      elf_common
-      ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
-      ${OPENMP_PTHREAD_LIB}
-      "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
-
-      NO_INSTALL_RPATH
-    )
-
-    # Install plugin under the lib destination folder.
-    install(TARGETS "omptarget.rtl.${tmachine_libname}"
-      LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-    set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES
-      INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-      CXX_VISIBILITY_PRESET protected)
-
-    target_include_directories( "omptarget.rtl.${tmachine_libname}" PRIVATE
-      ${LIBOMPTARGET_INCLUDE_DIR}
-      ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
-
-    list(APPEND LIBOMPTARGET_TESTED_PLUGINS
-      "omptarget.rtl.${tmachine_libname}")
-
-    # Report to the parent scope that we are building a plugin.
-    set(LIBOMPTARGET_SYSTEM_TARGETS
-      "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple} ${tmachine_triple}-LTO" PARENT_SCOPE)
-    set(LIBOMPTARGET_TESTED_PLUGINS
-      "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-
-  else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-    libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.")
-  endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
-else()
-  libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.")
-endif()
-endmacro()
-
-add_subdirectory(aarch64)
-add_subdirectory(amdgpu)
-add_subdirectory(cuda)
-add_subdirectory(ppc64)
-add_subdirectory(ppc64le)
-add_subdirectory(x86_64)
-
-# Make sure the parent scope can see the plugins that will be created.
-set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
-set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-
diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an aarch64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
-else()
-  libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt b/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is dual licensed under the MIT and the University of Illinois Open
-# Source Licenses. See LICENSE.txt for details.
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an AMDGPU machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-################################################################################
-set(LIBOMPTARGET_BUILD_AMDGPU_PLUGIN TRUE CACHE BOOL
-  "Whether to build AMDGPU plugin")
-if (NOT LIBOMPTARGET_BUILD_AMDGPU_PLUGIN)
-  libomptarget_say("Not building AMDGPU offloading plugin: LIBOMPTARGET_BUILD_AMDGPU_PLUGIN is false")
-  return()
-endif()
-
-# as of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa
-find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-
-if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  libomptarget_say("Not building AMDGPU plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts")
-  return()
-endif()
-
-################################################################################
-# Define the suffix for the runtime messaging dumps.
-add_definitions(-DTARGET_NAME=AMDGPU)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
-  add_definitions(-DLITTLEENDIAN_CPU=1)
-endif()
-
-if(CMAKE_BUILD_TYPE MATCHES Debug)
-  add_definitions(-DDEBUG)
-endif()
-
-set(LIBOMPTARGET_DLOPEN_LIBHSA OFF)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" ${LIBOMPTARGET_DLOPEN_LIBHSA})
-
-if (${hsa-runtime64_FOUND} AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
-  libomptarget_say("Building AMDGPU plugin linked against libhsa")
-  set(LIBOMPTARGET_EXTRA_SOURCE)
-  set(LIBOMPTARGET_DEP_LIBRARIES hsa-runtime64::hsa-runtime64)
-else()
-  libomptarget_say("Building AMDGPU plugin for dlopened libhsa")
-  include_directories(dynamic_hsa)
-  set(LIBOMPTARGET_EXTRA_SOURCE dynamic_hsa/hsa.cpp)
-  set(LIBOMPTARGET_DEP_LIBRARIES)
-endif()
-
-if(CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-  # On FreeBSD, the 'environ' symbol is undefined at link time, but resolved by
-  # the dynamic linker at runtime. Therefore, allow the symbol to be undefined
-  # when creating a shared library.
-  set(LDFLAGS_UNDEFINED "-Wl,--allow-shlib-undefined")
-else()
-  set(LDFLAGS_UNDEFINED "-Wl,-z,defs")
-endif()
-
-add_llvm_library(omptarget.rtl.amdgpu SHARED
-  impl/impl.cpp
-  impl/interop_hsa.cpp
-  impl/data.cpp
-  impl/get_elf_mach_gfx_name.cpp
-  impl/system.cpp
-  impl/msgpack.cpp
-  src/rtl.cpp
-  ${LIBOMPTARGET_EXTRA_SOURCE}
-
-  ADDITIONAL_HEADER_DIRS
-  ${LIBOMPTARGET_INCLUDE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/impl
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils
-
-  LINK_COMPONENTS
-  Support
-  Object
-
-  LINK_LIBS
-  PRIVATE
-  elf_common
-  ${LIBOMPTARGET_DEP_LIBRARIES}
-  ${OPENMP_PTHREAD_LIB}
-  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
-  ${LDFLAGS_UNDEFINED}
-
-  NO_INSTALL_RPATH
-)
-
-target_include_directories(
-  omptarget.rtl.amdgpu
-  PRIVATE
-  ${LIBOMPTARGET_INCLUDE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/impl
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../plugins-nextgen/amdgpu/utils
-)
-
-
-# Install plugin under the lib destination folder.
-install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
-set_target_properties(omptarget.rtl.amdgpu PROPERTIES
-  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
-  CXX_VISIBILITY_PRESET protected)
-
-# Report to the parent scope that we are building a plugin for hsa.
-# This controls whether tests are run for the nvptx offloading target
-# Run them if libhsa is available, or if the user explicitly asked for dlopen
-# Otherwise this plugin is being built speculatively and there may be no hsa available
-option(LIBOMPTARGET_FORCE_AMDGPU_TESTS "Build AMDGPU libomptarget tests" OFF)
-if (LIBOMPTARGET_FOUND_AMDGPU_GPU OR LIBOMPTARGET_FORCE_AMDGPU_TESTS)
-  # Report to the parent scope that we are building a plugin for amdgpu
-  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
-  list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.amdgpu")
-  set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
-else()
-  libomptarget_say("Not generating AMDGPU tests, no supported devices detected. Use 'LIBOMPTARGET_FORCE_AMDGPU_TESTS' to override.")
-  return()
-endif()
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp b/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/data.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===--- amdgpu/impl/data.cpp ------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "impl_runtime.h"
-#include "hsa_api.h"
-#include "internal.h"
-#include "rt.h"
-#include
-#include
-#include
-#include
-
-using core::TaskImpl;
-
-namespace core {
-namespace Runtime {
-hsa_status_t HostMalloc(void **ptr, size_t size,
-                        hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t err = hsa_amd_memory_pool_allocate(MemoryPool, size, 0, ptr);
-  DP("Malloced %p\n", *ptr);
-  if (err == HSA_STATUS_SUCCESS) {
-    err = core::allow_access_to_all_gpu_agents(*ptr);
-  }
-  return err;
-}
-
-hsa_status_t Memfree(void *ptr) {
-  hsa_status_t err = hsa_amd_memory_pool_free(ptr);
-  DP("Freed %p\n", ptr);
-  return err;
-}
-} // namespace Runtime
-} // namespace core
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.h
+++ /dev/null
@@ -1,15 +0,0 @@
-//===--- amdgpu/impl/get_elf_mach_gfx_name.h ---------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef GET_ELF_MACH_GFX_NAME_H_INCLUDED
-#define GET_ELF_MACH_GFX_NAME_H_INCLUDED
-
-#include
-
-const char *get_elf_mach_gfx_name(uint32_t EFlags);
-
-#endif
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp b/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/get_elf_mach_gfx_name.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-//===--- amdgpu/impl/get_elf_mach_gfx_name.cpp -------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "get_elf_mach_gfx_name.h"
-
-// This header conflicts with the system elf.h (macros vs enums of the same
-// identifier) and contains more up to date values for the enum checked here.
-// rtl.cpp uses the system elf.h.
-#include "llvm/BinaryFormat/ELF.h" - -const char *get_elf_mach_gfx_name(uint32_t EFlags) { - using namespace llvm::ELF; - uint32_t Gfx = (EFlags & EF_AMDGPU_MACH); - switch (Gfx) { - case EF_AMDGPU_MACH_AMDGCN_GFX801: - return "gfx801"; - case EF_AMDGPU_MACH_AMDGCN_GFX802: - return "gfx802"; - case EF_AMDGPU_MACH_AMDGCN_GFX803: - return "gfx803"; - case EF_AMDGPU_MACH_AMDGCN_GFX805: - return "gfx805"; - case EF_AMDGPU_MACH_AMDGCN_GFX810: - return "gfx810"; - case EF_AMDGPU_MACH_AMDGCN_GFX900: - return "gfx900"; - case EF_AMDGPU_MACH_AMDGCN_GFX902: - return "gfx902"; - case EF_AMDGPU_MACH_AMDGCN_GFX904: - return "gfx904"; - case EF_AMDGPU_MACH_AMDGCN_GFX906: - return "gfx906"; - case EF_AMDGPU_MACH_AMDGCN_GFX908: - return "gfx908"; - case EF_AMDGPU_MACH_AMDGCN_GFX909: - return "gfx909"; - case EF_AMDGPU_MACH_AMDGCN_GFX90A: - return "gfx90a"; - case EF_AMDGPU_MACH_AMDGCN_GFX90C: - return "gfx90c"; - case EF_AMDGPU_MACH_AMDGCN_GFX940: - return "gfx940"; - case EF_AMDGPU_MACH_AMDGCN_GFX1010: - return "gfx1010"; - case EF_AMDGPU_MACH_AMDGCN_GFX1011: - return "gfx1011"; - case EF_AMDGPU_MACH_AMDGCN_GFX1012: - return "gfx1012"; - case EF_AMDGPU_MACH_AMDGCN_GFX1013: - return "gfx1013"; - case EF_AMDGPU_MACH_AMDGCN_GFX1030: - return "gfx1030"; - case EF_AMDGPU_MACH_AMDGCN_GFX1031: - return "gfx1031"; - case EF_AMDGPU_MACH_AMDGCN_GFX1032: - return "gfx1032"; - case EF_AMDGPU_MACH_AMDGCN_GFX1033: - return "gfx1033"; - case EF_AMDGPU_MACH_AMDGCN_GFX1034: - return "gfx1034"; - case EF_AMDGPU_MACH_AMDGCN_GFX1035: - return "gfx1035"; - case EF_AMDGPU_MACH_AMDGCN_GFX1036: - return "gfx1036"; - case EF_AMDGPU_MACH_AMDGCN_GFX1100: - return "gfx1100"; - case EF_AMDGPU_MACH_AMDGCN_GFX1101: - return "gfx1101"; - case EF_AMDGPU_MACH_AMDGCN_GFX1102: - return "gfx1102"; - case EF_AMDGPU_MACH_AMDGCN_GFX1103: - return "gfx1103"; - default: - return "--unknown gfx"; - } -} diff --git a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h b/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/hsa_api.h +++ /dev/null @@ -1,26 +0,0 @@ -//===--- amdgpu/impl/hsa_api.h ------------------------------------ C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef AMDGPU_HSA_API_H_INCLUDED -#define AMDGPU_HSA_API_H_INCLUDED - -#if defined(__has_include) -#if __has_include("hsa/hsa.h") -#include "hsa/hsa.h" -#include "hsa/hsa_ext_amd.h" -#elif __has_include("hsa.h") -#include "hsa.h" -#include "hsa_ext_amd.h" -#endif -#else -#include "hsa/hsa.h" -#include "hsa_ext_amd.h" -#endif - - - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp b/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/impl.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "rt.h"
-#include
-
-/*
- * Data
- */
-
-hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
-  hsa_status_t err = HSA_STATUS_SUCCESS;
-  hsa_amd_pointer_info_t info;
-  info.size = sizeof(hsa_amd_pointer_info_t);
-  err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
-                             /*num_agents_accessible=*/nullptr,
-                             /*accessible=*/nullptr);
-  if (err != HSA_STATUS_SUCCESS) {
-    DP("Error when getting pointer info\n");
-    return err;
-  }
-
-  if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
-    // When user passes in a basePtr+offset we need to fix the
-    // locked pointer to include the offset: ROCr always returns
-    // the base locked address, not the shifted one.
-    if ((char *)info.hostBaseAddress <= (char *)ptr &&
-        (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
-      *agentBaseAddress =
-          (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
-                   (uint64_t)info.hostBaseAddress);
-    else // address is already device-agent accessible, no need to compute
-         // offset
-      *agentBaseAddress = ptr;
-  } else
-    *agentBaseAddress = nullptr;
-
-  return HSA_STATUS_SUCCESS;
-}
-
-// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
-static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
-                                    hsa_agent_t agent, const void *src,
-                                    size_t size) {
-  const hsa_signal_value_t init = 1;
-  const hsa_signal_value_t success = 0;
-  hsa_signal_store_screlease(signal, init);
-
-  hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size,
-                                               0, nullptr, signal);
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-
-  // async_copy reports success by decrementing and failure by setting to < 0
-  hsa_signal_value_t got = init;
-  while (got == init)
-    got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
-                                    UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
-
-  if (got != success)
-    return HSA_STATUS_ERROR;
-
-  return err;
-}
-
-struct implFreePtrDeletor {
-  void operator()(void *p) {
-    core::Runtime::Memfree(p); // ignore failure to free
-  }
-};
-
-enum CopyDirection { H2D, D2H };
-
-static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
-                                         hsa_signal_t signal, void *dest,
-                                         hsa_agent_t agent, void *src,
-                                         void *lockingPtr, size_t size) {
-  void *lockedPtr = nullptr;
-  hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
-  bool HostPtrIsLocked = true;
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-  if (!lockedPtr) { // not locked
-    HostPtrIsLocked = false;
-    hsa_agent_t agents[1] = {agent};
-    err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
-                              (void **)&lockedPtr);
-    if (err != HSA_STATUS_SUCCESS)
-      return err;
-    DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
-       lockingPtr, lockedPtr, size);
-  }
-
-  switch (direction) {
-  case H2D:
-    err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
-    break;
-  case D2H:
-    err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
-    break;
-  }
-
-  if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
-    // do not leak locked host pointers, but discard potential error message
-    // because the initial error was in the copy function
-    hsa_amd_memory_unlock(lockingPtr);
-    return err;
-  }
-
-  // unlock only if not user locked
-  if (!HostPtrIsLocked)
-    err = hsa_amd_memory_unlock(lockingPtr);
-  if (err != HSA_STATUS_SUCCESS)
-    return err;
-
-  return HSA_STATUS_SUCCESS;
-}
-
-hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
-                             void *hostSrc, size_t size,
-                             hsa_agent_t device_agent,
-                             hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t err;
-
-  err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
-                             device_agent, hostSrc, hostSrc, size);
-
-  if (err == HSA_STATUS_SUCCESS)
-    return err;
-
-  // async memcpy sometimes fails in situations where
-  // allocate + copy succeeds. Looks like it might be related to
-  // locking part of a read only segment. Fall back for now.
-  void *tempHostPtr;
-  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
-  if (ret != HSA_STATUS_SUCCESS) {
-    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
-    return ret;
-  }
-  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
-  memcpy(tempHostPtr, hostSrc, size);
-
-  return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
-                              device_agent, tempHostPtr, tempHostPtr, size);
-}
-
-hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
-                             void *deviceSrc, size_t size,
-                             hsa_agent_t deviceAgent,
-                             hsa_amd_memory_pool_t MemoryPool) {
-  hsa_status_t err;
-
-  // device has always visibility over both pointers, so use that
-  err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
-                             deviceSrc, hostDest, size);
-
-  if (err == HSA_STATUS_SUCCESS)
-    return err;
-
-  // hsa_memory_copy sometimes fails in situations where
-  // allocate + copy succeeds. Looks like it might be related to
-  // locking part of a read only segment. Fall back for now.
-  void *tempHostPtr;
-  hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
-  if (ret != HSA_STATUS_SUCCESS) {
-    DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
-    return ret;
-  }
-  std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
-
-  err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
-                             deviceAgent, deviceSrc, tempHostPtr, size);
-  if (err != HSA_STATUS_SUCCESS)
-    return HSA_STATUS_ERROR;
-
-  memcpy(hostDest, tempHostPtr, size);
-  return HSA_STATUS_SUCCESS;
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h b/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/impl_runtime.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- amdgpu/impl/impl_runtime.h ------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef INCLUDE_IMPL_RUNTIME_H_
-#define INCLUDE_IMPL_RUNTIME_H_
-
-#include "hsa_api.h"
-
-extern "C" {
-
-// Check if pointer ptr is already locked
-hsa_status_t is_locked(void *ptr, void **agentBaseAddress);
-
-hsa_status_t impl_module_register_from_memory_to_place(
-    void *module_bytes, size_t module_size, int DeviceId,
-    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
-                                         void *cb_state),
-    void *cb_state);
-
-hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
-                             void *hostSrc, size_t size,
-                             hsa_agent_t device_agent,
-                             hsa_amd_memory_pool_t MemoryPool);
-
-hsa_status_t impl_memcpy_d2h(hsa_signal_t sig, void *hostDest, void *deviceSrc,
-                             size_t size, hsa_agent_t device_agent,
-                             hsa_amd_memory_pool_t MemoryPool);
-}
-
-#endif // INCLUDE_IMPL_RUNTIME_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h
+++ /dev/null
@@ -1,154 +0,0 @@
-//===--- amdgpu/impl/internal.h ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_INTERNAL_H_
-#define SRC_RUNTIME_INCLUDE_INTERNAL_H_
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "hsa_api.h"
-
-#include "impl_runtime.h"
-
-#ifndef TARGET_NAME
-#error "Missing TARGET_NAME macro"
-#endif
-#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
-#include "Debug.h"
-
-#define MAX_NUM_KERNELS (1024 * 16)
-
-// ---------------------- Kernel Start -------------
-typedef struct atl_kernel_info_s {
-  uint64_t kernel_object;
-  uint32_t group_segment_size;
-  uint32_t private_segment_size;
-  uint32_t sgpr_count;
-  uint32_t vgpr_count;
-  uint32_t sgpr_spill_count;
-  uint32_t vgpr_spill_count;
-  uint32_t kernel_segment_size;
-  uint32_t explicit_argument_count;
-  uint32_t implicit_argument_count;
-} atl_kernel_info_t;
-
-typedef struct atl_symbol_info_s {
-  uint64_t addr;
-  uint32_t size;
-} atl_symbol_info_t;
-
-// ---------------------- Kernel End -------------
-
-namespace core {
-class TaskgroupImpl;
-class TaskImpl;
-class Kernel;
-class KernelImpl;
-} // namespace core
-
-struct SignalPoolT {
-  SignalPoolT() {}
-  SignalPoolT(const SignalPoolT &) = delete;
-  SignalPoolT(SignalPoolT &&) = delete;
-  ~SignalPoolT() {
-    size_t N = state.size();
-    for (size_t i = 0; i < N; i++) {
-      hsa_signal_t signal = state.front();
-      state.pop();
-      hsa_status_t rc = hsa_signal_destroy(signal);
-      if (rc != HSA_STATUS_SUCCESS) {
-        DP("Signal pool destruction failed\n");
-      }
-    }
-  }
-  size_t size() {
-    lock l(&mutex);
-    return state.size();
-  }
-  void push(hsa_signal_t s) {
-    lock l(&mutex);
-    state.push(s);
-  }
-  hsa_signal_t pop(void) {
-    lock l(&mutex);
-    if (!state.empty()) {
-      hsa_signal_t res = state.front();
-      state.pop();
-      return res;
-    }
-
-    // Pool empty, attempt to create another signal
-    hsa_signal_t new_signal;
-    hsa_status_t err = hsa_signal_create(0, 0, NULL, &new_signal);
-    if (err == HSA_STATUS_SUCCESS) {
-      return new_signal;
-    }
-
-    // Fail
-    return {0};
-  }
-
-private:
-  static pthread_mutex_t mutex;
-  std::queue<hsa_signal_t> state;
-  struct lock {
-    lock(pthread_mutex_t *m) : m(m) { pthread_mutex_lock(m); }
-    ~lock() { pthread_mutex_unlock(m); }
-    pthread_mutex_t *m;
-  };
-};
-
-namespace core {
-hsa_status_t atl_init_gpu_context();
-
-hsa_status_t init_hsa();
-hsa_status_t finalize_hsa();
-/*
- * Generic utils
- */
-template <typename T> inline T alignDown(T value, size_t alignment) {
-  return (T)(value & ~(alignment - 1));
-}
-
-template <typename T> inline T *alignDown(T *value, size_t alignment) {
-  return reinterpret_cast<T *>(alignDown((intptr_t)value, alignment));
-}
-
-template <typename T> inline T alignUp(T value, size_t alignment) {
-  return alignDown((T)(value + alignment - 1), alignment);
-}
-
-template <typename T> inline T *alignUp(T *value, size_t alignment) {
-  return reinterpret_cast<T *>(
-      alignDown((intptr_t)(value + alignment - 1), alignment));
-}
-
-extern bool atl_is_impl_initialized();
-
-bool handle_group_signal(hsa_signal_value_t value, void *arg);
-
-hsa_status_t allow_access_to_all_gpu_agents(void *ptr);
-} // namespace core
-
-inline const char *get_error_string(hsa_status_t err) {
-  const char *res;
-  hsa_status_t rc = hsa_status_string(err, &res);
-  return (rc == HSA_STATUS_SUCCESS) ? res : "HSA_STATUS UNKNOWN.";
-}
-
-#endif // SRC_RUNTIME_INCLUDE_INTERNAL_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===--- amdgpu/impl/interop_hsa.h -------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef INCLUDE_INTEROP_HSA_H_
-#define INCLUDE_INTEROP_HSA_H_
-
-#include "impl_runtime.h"
-#include "hsa_api.h"
-#include "internal.h"
-
-#include <map>
-#include <string>
-
-extern "C" {
-
-hsa_status_t interop_hsa_get_symbol_info(
-    const std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size);
-
-}
-
-#endif // INCLUDE_INTEROP_HSA_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp b/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/interop_hsa.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-//===--- amdgpu/impl/interop_hsa.cpp ------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include "interop_hsa.h"
-#include "internal.h"
-
-hsa_status_t interop_hsa_get_symbol_info(
-    const std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    int DeviceId, const char *symbol, void **var_addr, unsigned int *var_size) {
-  /*
-    // Typical usage:
-    void *var_addr;
-    size_t var_size;
-    interop_hsa_get_symbol_addr(gpu_place, "symbol_name", &var_addr,
-                                &var_size);
-    impl_memcpy(signal, host_add, var_addr, var_size);
-  */
-
-  if (!symbol || !var_addr || !var_size)
-    return HSA_STATUS_ERROR;
-
-  // get the symbol info
-  std::string symbolStr = std::string(symbol);
-  auto It = SymbolInfoTable.find(symbolStr);
-  if (It != SymbolInfoTable.end()) {
-    atl_symbol_info_t info = It->second;
-    *var_addr = reinterpret_cast<void *>(info.addr);
-    *var_size = info.size;
-    return HSA_STATUS_SUCCESS;
-  } else {
-    *var_addr = NULL;
-    *var_size = 0;
-    return HSA_STATUS_ERROR;
-  }
-}
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.h
+++ /dev/null
@@ -1,282 +0,0 @@
-//===--- amdgpu/impl/msgpack.h ------------------------------------ C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef MSGPACK_H
-#define MSGPACK_H
-
-#include
-
-namespace msgpack {
-
-// The message pack format is dynamically typed, schema-less. Format is:
-// message: [type][header][payload]
-// where type is one byte, header length is a fixed length function of type
-// payload is zero to N bytes, with the length encoded in [type][header]
-
-// Scalar fields include boolean, signed integer, float, string etc
-// Composite types are sequences of messages
-// Array field is [header][element][element]...
-// Map field is [header][key][value][key][value]...
-
-// Multibyte integer fields are big endian encoded
-// The map key can be any message type
-// Maps may contain duplicate keys
-// Data is not uniquely encoded, e.g. integer "8" may be stored as one byte or
-// in as many as nine, as signed or unsigned. Implementation defined.
-// Similarly "foo" may embed the length in the type field or in multiple bytes
-
-// This parser is structured as an iterator over a sequence of bytes.
-// It calls a user provided function on each message in order to extract fields
-// The default implementation for each scalar type is to do nothing. For map or
-// arrays, the default implementation returns just after that message to support
-// iterating to the next message, but otherwise has no effect.
-
-struct byte_range {
-  const unsigned char *start;
-  const unsigned char *end;
-};
-
-const unsigned char *skip_next_message(const unsigned char *start,
-                                       const unsigned char *end);
-
-template <typename Derived> class functors_defaults {
-public:
-  void cb_string(size_t N, const unsigned char *str) {
-    derived().handle_string(N, str);
-  }
-  void cb_boolean(bool x) { derived().handle_boolean(x); }
-  void cb_signed(int64_t x) { derived().handle_signed(x); }
-  void cb_unsigned(uint64_t x) { derived().handle_unsigned(x); }
-  void cb_array_elements(byte_range bytes) {
-    derived().handle_array_elements(bytes);
-  }
-  void cb_map_elements(byte_range key, byte_range value) {
-    derived().handle_map_elements(key, value);
-  }
-  const unsigned char *cb_array(uint64_t N, byte_range bytes) {
-    return derived().handle_array(N, bytes);
-  }
-  const unsigned char *cb_map(uint64_t N, byte_range bytes) {
-    return derived().handle_map(N, bytes);
-  }
-
-private:
-  Derived &derived() { return *static_cast<Derived *>(this); }
-
-  // Default implementations for scalar ops are no-ops
-  void handle_string(size_t, const unsigned char *) {}
-  void handle_boolean(bool) {}
-  void handle_signed(int64_t) {}
-  void handle_unsigned(uint64_t) {}
-  void handle_array_elements(byte_range) {}
-  void handle_map_elements(byte_range, byte_range) {}
-
-  // Default implementation for sequences is to skip over the messages
-  const unsigned char *handle_array(uint64_t N, byte_range bytes) {
-    for (uint64_t i = 0; i < N; i++) {
-      const unsigned char *next = skip_next_message(bytes.start, bytes.end);
-      if (!next) {
-        return nullptr;
-      }
-      cb_array_elements(bytes);
-      bytes.start = next;
-    }
-    return bytes.start;
-  }
-  const unsigned char *handle_map(uint64_t N, byte_range bytes) {
-    for (uint64_t i = 0; i < N; i++) {
-      const unsigned char *start_key = bytes.start;
-      const unsigned char *end_key = skip_next_message(start_key, bytes.end);
-      if (!end_key) {
-        return nullptr;
-      }
-      const unsigned char *start_value = end_key;
-      const unsigned char *end_value =
-          skip_next_message(start_value, bytes.end);
-      if (!end_value) {
-        return nullptr;
-      }
-      cb_map_elements({start_key, end_key}, {start_value, end_value});
-      bytes.start = end_value;
-    }
-    return bytes.start;
-  }
-};
-
-typedef enum : uint8_t {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER) NAME,
-#include "msgpack.def"
-#undef X
-} type;
-
-[[noreturn]] void internal_error();
-type parse_type(unsigned char x);
-unsigned bytes_used_fixed(type ty);
-
-typedef uint64_t (*payload_info_t)(const unsigned char *);
-payload_info_t payload_info(msgpack::type ty);
-
-template <typename T, typename R> R bitcast(T x);
-
-template <msgpack::type ty, typename F>
-const unsigned char *handle_msgpack_given_type(byte_range bytes, F f) {
-  const unsigned char *start = bytes.start;
-  const unsigned char *end = bytes.end;
-  const uint64_t available = end - start;
-  assert(available != 0);
-  assert(ty == parse_type(*start));
-
-  const uint64_t bytes_used = bytes_used_fixed(ty);
-  if (available < bytes_used) {
-    return 0;
-  }
-  const uint64_t available_post_header = available - bytes_used;
-
-  const payload_info_t info = payload_info(ty);
-  const uint64_t N = info(start);
-
-  switch (ty) {
-  case msgpack::t:
-  case msgpack::f: {
-    // t is 0b11000010, f is 0b11000011, masked with 0x1
-    f.cb_boolean(N);
-    return start + bytes_used;
-  }
-
-  case msgpack::posfixint:
-  case msgpack::uint8:
-  case msgpack::uint16:
-  case msgpack::uint32:
-  case msgpack::uint64: {
-    f.cb_unsigned(N);
-    return start + bytes_used;
-  }
-
-  case msgpack::negfixint:
-  case msgpack::int8:
-  case msgpack::int16:
-  case msgpack::int32:
-  case msgpack::int64: {
-    f.cb_signed(bitcast<uint64_t, int64_t>(N));
-    return start + bytes_used;
-  }
-
-  case msgpack::fixstr:
-  case msgpack::str8:
-  case msgpack::str16:
-  case msgpack::str32: {
-    if (available_post_header < N) {
-      return 0;
-    } else {
-      f.cb_string(N, start + bytes_used);
-      return start + bytes_used + N;
-    }
-  }
-
-  case msgpack::fixarray:
-  case msgpack::array16:
-  case msgpack::array32: {
-    return f.cb_array(N, {start + bytes_used, end});
-  }
-
-  case msgpack::fixmap:
-  case msgpack::map16:
-  case msgpack::map32: {
-    return f.cb_map(N, {start + bytes_used, end});
-  }
-
-  case msgpack::nil:
-  case msgpack::bin8:
-  case msgpack::bin16:
-  case msgpack::bin32:
-  case msgpack::float32:
-  case msgpack::float64:
-  case msgpack::ext8:
-  case msgpack::ext16:
-  case msgpack::ext32:
-  case msgpack::fixext1:
-  case msgpack::fixext2:
-  case msgpack::fixext4:
-  case msgpack::fixext8:
-  case msgpack::fixext16:
-  case msgpack::never_used: {
-    if (available_post_header < N) {
-      return 0;
-    }
-    return start + bytes_used + N;
-  }
-  }
-  internal_error();
-}
-
-template <typename F>
-const unsigned char *handle_msgpack(byte_range bytes, F f) {
-  const unsigned char *start = bytes.start;
-  const unsigned char *end = bytes.end;
-  const uint64_t available = end - start;
-  if (available == 0) {
-    return 0;
-  }
-  const type ty = parse_type(*start);
-
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case msgpack::NAME:                                                          \
-    return handle_msgpack_given_type<msgpack::NAME, F>(bytes, f);
-#include "msgpack.def"
-#undef X
-  }
-
-  internal_error();
-}
-
-bool message_is_string(byte_range bytes, const char *str);
-
-template <typename C> void foronly_string(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_string(size_t N, const unsigned char *str) { cb(N, str); }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foronly_unsigned(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_unsigned(uint64_t x) { cb(x); }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foreach_array(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_array_elements(byte_range element) { cb(element); }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-template <typename C> void foreach_map(byte_range bytes, C callback) {
-  struct inner : functors_defaults<inner> {
-    inner(C &cb) : cb(cb) {}
-    C &cb;
-    void handle_map_elements(byte_range key, byte_range value) {
-      cb(key, value);
-    }
-  };
-  handle_msgpack<inner>(bytes, {callback});
-}
-
-// Crude approximation to json
-void dump(byte_range);
-
-} // namespace msgpack
-
-#endif
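For reference, the visitor-style API the deleted msgpack.h provided was driven as below; a minimal, self-contained sketch (hypothetical driver program, assuming msgpack.h and its implementation file were still on the build path):

    #include "msgpack.h"
    #include <cstdio>

    int main() {
      // 0xa3 is a fixstr type byte carrying the length 3 in its low five
      // bits, followed by the three payload bytes of "foo".
      const unsigned char msg[] = {0xa3, 'f', 'o', 'o'};
      msgpack::byte_range bytes{msg, msg + sizeof(msg)};
      // foronly_string invokes the callback only for string messages and
      // ignores every other message type.
      msgpack::foronly_string(bytes, [](size_t n, const unsigned char *str) {
        printf("string of %zu bytes: %.*s\n", n, (int)n, (const char *)str);
      });
      return 0;
    }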
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-//===--- amdgpu/impl/msgpack.cpp ---------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#include
-#include
-#include
-#include
-#include
-
-#include "msgpack.h"
-
-namespace msgpack {
-
-[[noreturn]] void internal_error() {
-  printf("internal error\n");
-  exit(1);
-}
-
-const char *type_name(type ty) {
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case NAME:                                                                   \
-    return #NAME;
-#include "msgpack.def"
-#undef X
-  }
-  internal_error();
-}
-
-unsigned bytes_used_fixed(msgpack::type ty) {
-  using namespace msgpack;
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case NAME:                                                                   \
-    return WIDTH;
-#include "msgpack.def"
-#undef X
-  }
-  internal_error();
-}
-
-msgpack::type parse_type(unsigned char x) {
-
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  if (x >= LOWER && x <= UPPER) {                                              \
-    return NAME;                                                               \
-  } else
-#include "msgpack.def"
-#undef X
-  { internal_error(); }
-}
-
-template <typename T, typename R> R bitcast(T x) {
-  static_assert(sizeof(T) == sizeof(R), "");
-  R tmp;
-  memcpy(&tmp, &x, sizeof(T));
-  return tmp;
-}
-template int64_t bitcast<uint64_t, int64_t>(uint64_t);
-} // namespace msgpack
-
-// Helper functions for reading additional payload from the header
-// Depending on the type, this can be a number of bytes, elements,
-// key-value pairs or an embedded integer.
-// Each takes a pointer to the start of the header and returns a uint64_t
-
-namespace {
-namespace payload {
-uint64_t read_zero(const unsigned char *) { return 0; }
-
-// Read the first byte and zero/sign extend it
-uint64_t read_embedded_u8(const unsigned char *start) { return start[0]; }
-uint64_t read_embedded_s8(const unsigned char *start) {
-  int64_t res = msgpack::bitcast<uint8_t, int8_t>(start[0]);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-
-// Read a masked part of the first byte
-uint64_t read_via_mask_0x1(const unsigned char *start) { return *start & 0x1u; }
-uint64_t read_via_mask_0xf(const unsigned char *start) { return *start & 0xfu; }
-uint64_t read_via_mask_0x1f(const unsigned char *start) {
-  return *start & 0x1fu;
-}
-
-// Read 1/2/4/8 bytes immediately following the type byte and zero/sign extend
-// Big endian format.
-uint64_t read_size_field_u8(const unsigned char *from) {
-  from++;
-  return from[0];
-}
-
-// TODO: detect whether host is little endian or not, and whether the intrinsic
-// is available. And probably use the builtin to test the diy
-const bool use_bswap = false;
-
-uint64_t read_size_field_u16(const unsigned char *from) {
-  from++;
-  if (use_bswap) {
-    uint16_t b;
-    memcpy(&b, from, 2);
-    return __builtin_bswap16(b);
-  } else {
-    return (from[0] << 8u) | from[1];
-  }
-}
-uint64_t read_size_field_u32(const unsigned char *from) {
-  from++;
-  if (use_bswap) {
-    uint32_t b;
-    memcpy(&b, from, 4);
-    return __builtin_bswap32(b);
-  } else {
-    return (from[0] << 24u) | (from[1] << 16u) | (from[2] << 8u) |
-           (from[3] << 0u);
-  }
-}
-uint64_t read_size_field_u64(const unsigned char *from) {
-  from++;
-  if (use_bswap) {
-    uint64_t b;
-    memcpy(&b, from, 8);
-    return __builtin_bswap64(b);
-  } else {
-    return ((uint64_t)from[0] << 56u) | ((uint64_t)from[1] << 48u) |
-           ((uint64_t)from[2] << 40u) | ((uint64_t)from[3] << 32u) |
-           (from[4] << 24u) | (from[5] << 16u) | (from[6] << 8u) |
-           (from[7] << 0u);
-  }
-}
-
-uint64_t read_size_field_s8(const unsigned char *from) {
-  uint8_t u = read_size_field_u8(from);
-  int64_t res = msgpack::bitcast<uint8_t, int8_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s16(const unsigned char *from) {
-  uint16_t u = read_size_field_u16(from);
-  int64_t res = msgpack::bitcast<uint16_t, int16_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s32(const unsigned char *from) {
-  uint32_t u = read_size_field_u32(from);
-  int64_t res = msgpack::bitcast<uint32_t, int32_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-uint64_t read_size_field_s64(const unsigned char *from) {
-  uint64_t u = read_size_field_u64(from);
-  int64_t res = msgpack::bitcast<uint64_t, int64_t>(u);
-  return msgpack::bitcast<int64_t, uint64_t>(res);
-}
-} // namespace payload
-} // namespace
-
-namespace msgpack {
-
-payload_info_t payload_info(msgpack::type ty) {
-  using namespace msgpack;
-  switch (ty) {
-#define X(NAME, WIDTH, PAYLOAD, LOWER, UPPER)                                  \
-  case NAME:                                                                   \
-    return payload::PAYLOAD;
-#include "msgpack.def"
-#undef X
-  }
-  internal_error();
-}
-
-} // namespace msgpack
-
-const unsigned char *msgpack::skip_next_message(const unsigned char *start,
-                                                const unsigned char *end) {
-  class f : public functors_defaults<f> {};
-  return handle_msgpack({start, end}, f());
-}
-
-namespace msgpack {
-bool message_is_string(byte_range bytes, const char *needle) {
-  bool matched = false;
-  size_t needleN = strlen(needle);
-
-  foronly_string(bytes, [=, &matched](size_t N, const unsigned char *str) {
-    if (N == needleN) {
-      if (memcmp(needle, str, N) == 0) {
-        matched = true;
-      }
-    }
-  });
-  return matched;
-}
-
-void dump(byte_range bytes) {
-  struct inner : functors_defaults<inner> {
-    inner(unsigned indent) : indent(indent) {}
-    const unsigned by = 2;
-    unsigned indent = 0;
-
-    void handle_string(size_t N, const unsigned char *bytes) {
-      char *tmp = (char *)malloc(N + 1);
-      memcpy(tmp, bytes, N);
-      tmp[N] = '\0';
-      printf("\"%s\"", tmp);
-      free(tmp);
-    }
-
-    void handle_signed(int64_t x) { printf("%ld", x); }
-    void handle_unsigned(uint64_t x) { printf("%lu", x); }
-
-    const unsigned char *handle_array(uint64_t N, byte_range bytes) {
-      printf("\n%*s[\n", indent, "");
-      indent += by;
-
-      for (uint64_t i = 0; i < N; i++) {
-        indent += by;
-        printf("%*s", indent, "");
-        const unsigned char *next = handle_msgpack<inner>(bytes, {indent});
-        printf(",\n");
-        indent -= by;
-        bytes.start = next;
-        if (!next) {
-          break;
-        }
-      }
-      indent -= by;
-      printf("%*s]", indent, "");
-
-      return bytes.start;
-    }
-
-    const unsigned char *handle_map(uint64_t N, byte_range bytes) {
-      printf("\n%*s{\n", indent, "");
-      indent += by;
-
-      for (uint64_t i = 0; i < 2 * N; i += 2) {
-        const unsigned char *start_key = bytes.start;
-        printf("%*s", indent, "");
-        const unsigned char *end_key =
-            handle_msgpack<inner>({start_key, bytes.end}, {indent});
-        if (!end_key) {
-          break;
-        }
-
-        printf(" : ");
-
-        const unsigned char *start_value = end_key;
-        const unsigned char *end_value =
-            handle_msgpack<inner>({start_value, bytes.end}, {indent});
-
-        if (!end_value) {
-          break;
-        }
-
-        printf(",\n");
-        bytes.start = end_value;
-      }
-
-      indent -= by;
-      printf("%*s}", indent, "");
-
-      return bytes.start;
-    }
-  };
-
-  handle_msgpack<inner>(bytes, {0});
-  printf("\n");
-}
-
-} // namespace msgpack
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def b/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/msgpack.def
+++ /dev/null
@@ -1,46 +0,0 @@
-//===--- amdgpu/impl/msgpack.def ---------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// name, header width, reader, [lower, upper] encoding
-X(posfixint, 1, read_embedded_u8, 0x00, 0x7f)
-X(negfixint, 1, read_embedded_s8, 0xe0, 0xff)
-X(fixmap, 1, read_via_mask_0xf, 0x80, 0x8f)
-X(fixarray, 1, read_via_mask_0xf, 0x90, 0x9f)
-X(fixstr, 1, read_via_mask_0x1f, 0xa0, 0xbf)
-X(nil, 1, read_zero, 0xc0, 0xc0)
-X(never_used, 1, read_zero, 0xc1, 0xc1)
-X(f, 1, read_via_mask_0x1, 0xc2, 0xc2)
-X(t, 1, read_via_mask_0x1, 0xc3, 0xc3)
-X(bin8, 2, read_size_field_u8, 0xc4, 0xc4)
-X(bin16, 3, read_size_field_u16, 0xc5, 0xc5)
-X(bin32, 5, read_size_field_u32, 0xc6, 0xc6)
-X(ext8, 3, read_size_field_u8, 0xc7, 0xc7)
-X(ext16, 4, read_size_field_u16, 0xc8, 0xc8)
-X(ext32, 6, read_size_field_u32, 0xc9, 0xc9)
-X(float32, 5, read_zero, 0xca, 0xca)
-X(float64, 9, read_zero, 0xcb, 0xcb)
-X(uint8, 2, read_size_field_u8, 0xcc, 0xcc)
-X(uint16, 3, read_size_field_u16, 0xcd, 0xcd)
-X(uint32, 5, read_size_field_u32, 0xce, 0xce)
-X(uint64, 9, read_size_field_u64, 0xcf, 0xcf)
-X(int8, 2, read_size_field_s8, 0xd0, 0xd0)
-X(int16, 3, read_size_field_s16, 0xd1, 0xd1)
-X(int32, 5, read_size_field_s32, 0xd2, 0xd2)
-X(int64, 9, read_size_field_s64, 0xd3, 0xd3)
-X(fixext1, 3, read_zero, 0xd4, 0xd4)
-X(fixext2, 4, read_zero, 0xd5, 0xd5)
-X(fixext4, 6, read_zero, 0xd6, 0xd6)
-X(fixext8, 10, read_zero, 0xd7, 0xd7)
-X(fixext16, 18, read_zero, 0xd8, 0xd8)
-X(str8, 2, read_size_field_u8, 0xd9, 0xd9)
-X(str16, 3, read_size_field_u16, 0xda, 0xda)
-X(str32, 5, read_size_field_u32, 0xdb, 0xdb)
-X(array16, 3, read_size_field_u16, 0xdc, 0xdc)
-X(array32, 5, read_size_field_u32, 0xdd, 0xdd)
-X(map16, 3, read_size_field_u16, 0xde, 0xde)
-X(map32, 5, read_size_field_u32, 0xdf, 0xdf)
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/rt.h b/openmp/libomptarget/plugins/amdgpu/impl/rt.h
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/rt.h
+++ /dev/null
@@ -1,34 +0,0 @@
-//===--- amdgpu/impl/rt.h ----------------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#ifndef SRC_RUNTIME_INCLUDE_RT_H_
-#define SRC_RUNTIME_INCLUDE_RT_H_
-
-#include "hsa_api.h"
-#include "impl_runtime.h"
-#include "internal.h"
-
-#include <string>
-
-namespace core {
-namespace Runtime {
-hsa_status_t Memfree(void *);
-hsa_status_t HostMalloc(void **ptr, size_t size,
-                        hsa_amd_memory_pool_t MemoryPool);
-
-} // namespace Runtime
-hsa_status_t RegisterModuleFromMemory(
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
-    void *module_bytes, size_t module_size, hsa_agent_t agent,
-    hsa_status_t (*on_deserialized_data)(void *data, size_t size,
-                                         void *cb_state),
-    void *cb_state, std::vector<hsa_executable_t> &HSAExecutables);
-
-} // namespace core
-
-#endif // SRC_RUNTIME_INCLUDE_RT_H_
diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp
+++ /dev/null
@@ -1,744 +0,0 @@
-//===--- amdgpu/impl/system.cpp ----------------------------------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Object/ELFObjectFile.h"
-
-#include <cassert>
-#include <sstream>
-#include <string>
-
-#include "internal.h"
-#include "rt.h"
-
-#include "msgpack.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-
-namespace hsa {
-// Wrap HSA iterate API in a shim that allows passing general callables
-template <typename C>
-hsa_status_t executable_iterate_symbols(hsa_executable_t executable, C cb) {
-  auto L = [](hsa_executable_t executable, hsa_executable_symbol_t symbol,
-              void *data) -> hsa_status_t {
-    C *unwrapped = static_cast<C *>(data);
-    return (*unwrapped)(executable, symbol);
-  };
-  return hsa_executable_iterate_symbols(executable, L,
-                                        static_cast<void *>(&cb));
-}
-} // namespace hsa
-
-typedef unsigned char *address;
-/*
- * Note descriptors.
- */
-// FreeBSD already declares Elf_Note (indirectly via <libelf.h>)
-#if !defined(__FreeBSD__)
-typedef struct {
-  uint32_t n_namesz; /* Length of note's name. */
-  uint32_t n_descsz; /* Length of note's value. */
-  uint32_t n_type;   /* Type of note. */
-  // then name
-  // then padding, optional
-  // then desc, at 4 byte alignment (not 8, despite being elf64)
-} Elf_Note;
-#endif
-
-class KernelArgMD {
-public:
-  enum class ValueKind {
-    HiddenGlobalOffsetX,
-    HiddenGlobalOffsetY,
-    HiddenGlobalOffsetZ,
-    HiddenNone,
-    HiddenPrintfBuffer,
-    HiddenDefaultQueue,
-    HiddenCompletionAction,
-    HiddenMultiGridSyncArg,
-    HiddenHostcallBuffer,
-    HiddenHeapV1,
-    Unknown
-  };
-
-  KernelArgMD()
-      : name_(std::string()), size_(0), offset_(0),
-        valueKind_(ValueKind::Unknown) {}
-
-  // fields
-  std::string name_;
-  uint32_t size_;
-  uint32_t offset_;
-  ValueKind valueKind_;
-};
-
-static const std::map<std::string, KernelArgMD::ValueKind> ArgValueKind = {
-    // v3
-    //  {"by_value", KernelArgMD::ValueKind::ByValue},
-    //  {"global_buffer", KernelArgMD::ValueKind::GlobalBuffer},
-    //  {"dynamic_shared_pointer",
-    //  KernelArgMD::ValueKind::DynamicSharedPointer},
-    //  {"sampler", KernelArgMD::ValueKind::Sampler},
-    //  {"image", KernelArgMD::ValueKind::Image},
-    //  {"pipe", KernelArgMD::ValueKind::Pipe},
-    //  {"queue", KernelArgMD::ValueKind::Queue},
-    {"hidden_global_offset_x", KernelArgMD::ValueKind::HiddenGlobalOffsetX},
-    {"hidden_global_offset_y", KernelArgMD::ValueKind::HiddenGlobalOffsetY},
-    {"hidden_global_offset_z", KernelArgMD::ValueKind::HiddenGlobalOffsetZ},
-    {"hidden_none", KernelArgMD::ValueKind::HiddenNone},
-    {"hidden_printf_buffer", KernelArgMD::ValueKind::HiddenPrintfBuffer},
-    {"hidden_default_queue", KernelArgMD::ValueKind::HiddenDefaultQueue},
-    {"hidden_completion_action",
-     KernelArgMD::ValueKind::HiddenCompletionAction},
-    {"hidden_multigrid_sync_arg",
-     KernelArgMD::ValueKind::HiddenMultiGridSyncArg},
-    {"hidden_hostcall_buffer", KernelArgMD::ValueKind::HiddenHostcallBuffer},
-    {"hidden_heap_v1", KernelArgMD::ValueKind::HiddenHeapV1}};
-
-namespace core {
-
-hsa_status_t callbackEvent(const hsa_amd_event_t *event, void *data) {
-  if (event->event_type == HSA_AMD_GPU_MEMORY_FAULT_EVENT) {
-    hsa_amd_gpu_memory_fault_info_t memory_fault = event->memory_fault;
-    // memory_fault.agent
-    // memory_fault.virtual_address
-    // memory_fault.fault_reason_mask
-    // fprintf("[GPU Error at %p: Reason is ", memory_fault.virtual_address);
-    std::stringstream stream;
-    stream << std::hex << (uintptr_t)memory_fault.virtual_address;
-    std::string addr("0x" + stream.str());
-
-    std::string err_string = "[GPU Memory Error] Addr: " + addr;
-    err_string += " Reason: ";
-    if (!(memory_fault.fault_reason_mask & 0x00111111)) {
-      err_string += "No Idea! ";
-    } else {
-      if (memory_fault.fault_reason_mask & 0x00000001)
-        err_string += "Page not present or supervisor privilege. ";
-      if (memory_fault.fault_reason_mask & 0x00000010)
-        err_string += "Write access to a read-only page. ";
-      if (memory_fault.fault_reason_mask & 0x00000100)
-        err_string += "Execute access to a page marked NX. ";
-      if (memory_fault.fault_reason_mask & 0x00001000)
-        err_string += "Host access only. ";
-      if (memory_fault.fault_reason_mask & 0x00010000)
-        err_string += "ECC failure (if supported by HW). ";
-      if (memory_fault.fault_reason_mask & 0x00100000)
-        err_string += "Can't determine the exact fault address. ";
"; - } - fprintf(stderr, "%s\n", err_string.c_str()); - return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t atl_init_gpu_context() { - hsa_status_t err = hsa_amd_register_system_event_handler(callbackEvent, NULL); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Registering the system for memory faults", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - return HSA_STATUS_SUCCESS; -} - -static bool isImplicit(KernelArgMD::ValueKind value_kind) { - switch (value_kind) { - case KernelArgMD::ValueKind::HiddenGlobalOffsetX: - case KernelArgMD::ValueKind::HiddenGlobalOffsetY: - case KernelArgMD::ValueKind::HiddenGlobalOffsetZ: - case KernelArgMD::ValueKind::HiddenNone: - case KernelArgMD::ValueKind::HiddenPrintfBuffer: - case KernelArgMD::ValueKind::HiddenDefaultQueue: - case KernelArgMD::ValueKind::HiddenCompletionAction: - case KernelArgMD::ValueKind::HiddenMultiGridSyncArg: - case KernelArgMD::ValueKind::HiddenHostcallBuffer: - case KernelArgMD::ValueKind::HiddenHeapV1: - return true; - default: - return false; - } -} - -static std::pair -findMetadata(const ELFObjectFile &ELFObj) { - constexpr std::pair Failure = { - nullptr, nullptr}; - const auto &Elf = ELFObj.getELFFile(); - auto PhdrsOrErr = Elf.program_headers(); - if (!PhdrsOrErr) { - consumeError(PhdrsOrErr.takeError()); - return Failure; - } - - for (auto Phdr : *PhdrsOrErr) { - if (Phdr.p_type != PT_NOTE) - continue; - - Error Err = Error::success(); - for (auto Note : Elf.notes(Phdr, Err)) { - if (Note.getType() == 7 || Note.getType() == 8) - return Failure; - - // Code object v2 uses yaml metadata and is no longer supported. - if (Note.getType() == NT_AMD_HSA_METADATA && Note.getName() == "AMD") - return Failure; - // Code object v3 should have AMDGPU metadata. 
-namespace {
-int map_lookup_array(msgpack::byte_range message, const char *needle,
-                     msgpack::byte_range *res, uint64_t *size) {
-  unsigned count = 0;
-  struct s : msgpack::functors_defaults<s> {
-    s(unsigned &count, uint64_t *size) : count(count), size(size) {}
-    unsigned &count;
-    uint64_t *size;
-    const unsigned char *handle_array(uint64_t N, msgpack::byte_range bytes) {
-      count++;
-      *size = N;
-      return bytes.end;
-    }
-  };
-
-  msgpack::foreach_map(message,
-                       [&](msgpack::byte_range key, msgpack::byte_range value) {
-                         if (msgpack::message_is_string(key, needle)) {
-                           // If the message is an array, record number of
-                           // elements in *size
-                           msgpack::handle_msgpack<s>(value, {count, size});
-                           // return the whole array
-                           *res = value;
-                         }
-                       });
-  // Only claim success if exactly one key/array pair matched
-  return count != 1;
-}
-
-int map_lookup_string(msgpack::byte_range message, const char *needle,
-                      std::string *res) {
-  unsigned count = 0;
-  struct s : public msgpack::functors_defaults<s> {
-    s(unsigned &count, std::string *res) : count(count), res(res) {}
-    unsigned &count;
-    std::string *res;
-    void handle_string(size_t N, const unsigned char *str) {
-      count++;
-      *res = std::string(str, str + N);
-    }
-  };
-  msgpack::foreach_map(message,
-                       [&](msgpack::byte_range key, msgpack::byte_range value) {
-                         if (msgpack::message_is_string(key, needle)) {
-                           msgpack::handle_msgpack<s>(value, {count, res});
-                         }
-                       });
-  return count != 1;
-}
-
-int map_lookup_uint64_t(msgpack::byte_range message, const char *needle,
-                        uint64_t *res) {
-  unsigned count = 0;
-  msgpack::foreach_map(message,
-                       [&](msgpack::byte_range key, msgpack::byte_range value) {
-                         if (msgpack::message_is_string(key, needle)) {
-                           msgpack::foronly_unsigned(value, [&](uint64_t x) {
-                             count++;
-                             *res = x;
-                           });
-                         }
-                       });
-  return count != 1;
}
-
-int array_lookup_element(msgpack::byte_range message, uint64_t elt,
-                         msgpack::byte_range *res) {
-  int rc = 1;
-  uint64_t i = 0;
-  msgpack::foreach_array(message, [&](msgpack::byte_range value) {
-    if (i == elt) {
-      *res = value;
-      rc = 0;
-    }
-    i++;
-  });
-  return rc;
-}
-
-int populate_kernelArgMD(msgpack::byte_range args_element,
-                         KernelArgMD *kernelarg) {
-  using namespace msgpack;
-  int error = 0;
-  foreach_map(args_element, [&](byte_range key, byte_range value) -> void {
-    if (message_is_string(key, ".name")) {
-      foronly_string(value, [&](size_t N, const unsigned char *str) {
-        kernelarg->name_ = std::string(str, str + N);
-      });
-    } else if (message_is_string(key, ".size")) {
-      foronly_unsigned(value, [&](uint64_t x) { kernelarg->size_ = x; });
-    } else if (message_is_string(key, ".offset")) {
-      foronly_unsigned(value, [&](uint64_t x) { kernelarg->offset_ = x; });
-    } else if (message_is_string(key, ".value_kind")) {
-      foronly_string(value, [&](size_t N, const unsigned char *str) {
-        std::string s = std::string(str, str + N);
-        auto itValueKind = ArgValueKind.find(s);
-        if (itValueKind != ArgValueKind.end()) {
-          kernelarg->valueKind_ = itValueKind->second;
-        }
-      });
-    }
-  });
-  return error;
-}
-} // namespace
-
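populate_kernelArgMD above fills one KernelArgMD per ``.args`` entry; get_code_object_custom_metadata below walks those entries and reconstructs the kernarg footprint purely from the (.offset, .size) pairs, charging each inter-argument alignment gap to the explicit size. The padding rule in isolation, as a hypothetical helper shown only to make the arithmetic explicit:

.. code-block:: cpp

   #include <cstddef>
   #include <cstdint>
   #include <vector>

   struct ArgLayout {
     uint32_t Size;
     uint32_t Offset;
     bool Hidden; // value kind starts with "hidden_"
   };

   // Mirror of the loop below: explicit bytes plus every alignment gap.
   size_t explicitKernargBytes(const std::vector<ArgLayout> &Args) {
     size_t Total = 0, Cursor = 0;
     for (const ArgLayout &A : Args) {
       size_t Padding = A.Offset - Cursor; // gap inserted before this argument
       Cursor = A.Offset + A.Size;
       if (!A.Hidden)
         Total += A.Size; // explicit args contribute their size
       Total += Padding;  // padding counts whether or not the arg is hidden
     }
     return Total;
   }

For example, two 8-byte pointers at offsets 0 and 8, a 4-byte int at offset 16, and a hidden arg at offset 24 yield 20 explicit bytes plus 4 bytes of padding, i.e. 24, which is what the loop stores when hasHiddenArgs is set.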
-static hsa_status_t get_code_object_custom_metadata(
-    void *binary, size_t binSize,
-    std::map<std::string, atl_kernel_info_t> &KernelInfoTable) {
-  // parse code object with different keys from v2
-  // also, the kernel name is not the same as the symbol name -- so a
-  // symbol->name map is needed
-
-  std::pair<const unsigned char *, const unsigned char *> metadata =
-      find_metadata(binary, binSize);
-  if (!metadata.first) {
-    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-  }
-
-  uint64_t kernelsSize = 0;
-  int msgpack_errors = 0;
-  msgpack::byte_range kernel_array;
-  msgpack_errors =
-      map_lookup_array({metadata.first, metadata.second}, "amdhsa.kernels",
-                       &kernel_array, &kernelsSize);
-  if (msgpack_errors != 0) {
-    printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-           "kernels lookup in program metadata");
-    return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-  }
-
-  for (size_t i = 0; i < kernelsSize; i++) {
-    assert(msgpack_errors == 0);
-    std::string kernelName;
-    std::string symbolName;
-
-    msgpack::byte_range element;
-    msgpack_errors += array_lookup_element(kernel_array, i, &element);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "element lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    msgpack_errors += map_lookup_string(element, ".name", &kernelName);
-    msgpack_errors += map_lookup_string(element, ".symbol", &symbolName);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "strings lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    // Make sure that kernelName + ".kd" == symbolName
-    if ((kernelName + ".kd") != symbolName) {
-      printf("[%s:%d] Kernel name mismatching symbol: %s != %s + .kd\n",
-             __FILE__, __LINE__, symbolName.c_str(), kernelName.c_str());
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-    uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
-    msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "sgpr count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.sgpr_count = sgpr_count;
-
-    msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "vgpr count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.vgpr_count = vgpr_count;
-
-    msgpack_errors +=
-        map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "sgpr spill count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.sgpr_spill_count = sgpr_spill_count;
-
-    msgpack_errors +=
-        map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "vgpr spill count metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    info.vgpr_spill_count = vgpr_spill_count;
-
-    size_t kernel_explicit_args_size = 0;
-    uint64_t kernel_segment_size;
-    msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size",
-                                          &kernel_segment_size);
-    if (msgpack_errors != 0) {
-      printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-             "kernarg segment size metadata lookup in kernel metadata");
-      return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-    }
-
-    bool hasHiddenArgs = false;
-    if (kernel_segment_size > 0) {
-      uint64_t argsSize;
-      size_t offset = 0;
-
-      msgpack::byte_range args_array;
-      msgpack_errors +=
-          map_lookup_array(element, ".args", &args_array, &argsSize);
-      if (msgpack_errors != 0) {
-        printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-               "kernel args metadata lookup in kernel metadata");
-        return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-      }
-
-      for (size_t i = 0; i < argsSize; ++i) {
-        KernelArgMD lcArg;
-
-        msgpack::byte_range args_element;
-        msgpack_errors += array_lookup_element(args_array, i, &args_element);
-        if (msgpack_errors != 0) {
-          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-                 "iterate args map in kernel args metadata");
-          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-        }
-
-        msgpack_errors += populate_kernelArgMD(args_element, &lcArg);
-        if (msgpack_errors != 0) {
-          printf("[%s:%d] %s failed\n", __FILE__, __LINE__,
-                 "iterate args map in kernel args metadata");
-          return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
-        }
-        // v3 has offset field and not align field
-        size_t new_offset = lcArg.offset_;
-        size_t padding = new_offset - offset;
-        offset = new_offset;
-        DP("Arg[%lu] \"%s\" (%u, %u)\n", i, lcArg.name_.c_str(), lcArg.size_,
-           lcArg.offset_);
-        offset += lcArg.size_;
-
-        // check if the arg is a hidden/implicit arg
-        // this logic assumes that all hidden args are 8-byte aligned
-        if (!isImplicit(lcArg.valueKind_)) {
-          info.explicit_argument_count++;
-          kernel_explicit_args_size += lcArg.size_;
-        } else {
-          info.implicit_argument_count++;
-          hasHiddenArgs = true;
-        }
-        kernel_explicit_args_size += padding;
-      }
-    }
-
-    // TODO: Probably don't want this arithmetic
-    info.kernel_segment_size =
-        (hasHiddenArgs ? kernel_explicit_args_size : kernel_segment_size);
-    DP("[%s: kernarg seg size] (%lu --> %u)\n", kernelName.c_str(),
-       kernel_segment_size, info.kernel_segment_size);
-
-    // kernel received, now add it to the kernel info table
-    KernelInfoTable[kernelName] = info;
-  }
-
-  return HSA_STATUS_SUCCESS;
-}
-
-static hsa_status_t
-populate_InfoTables(hsa_executable_symbol_t symbol,
-                    std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
-                    std::map<std::string, atl_symbol_info_t> &SymbolInfoTable) {
-  hsa_symbol_kind_t type;
-
-  uint32_t name_length;
-  hsa_status_t err;
-  err = hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE,
-                                       &type);
-  if (err != HSA_STATUS_SUCCESS) {
-    printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-           "Symbol info extraction", get_error_string(err));
-    return err;
-  }
-  DP("Exec Symbol type: %d\n", type);
-  if (type == HSA_SYMBOL_KIND_KERNEL) {
-    err = hsa_executable_symbol_get_info(
-        symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length);
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info extraction", get_error_string(err));
-      return err;
-    }
-    char *name = reinterpret_cast<char *>(malloc(name_length + 1));
-    err = hsa_executable_symbol_get_info(symbol,
-                                         HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
-    if (err != HSA_STATUS_SUCCESS) {
-      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
-             "Symbol info extraction", get_error_string(err));
-      return err;
-    }
-    // remove the suffix .kd from symbol name.
- name[name_length - 3] = 0; - - atl_kernel_info_t info; - std::string kernelName(name); - // by now, the kernel info table should already have an entry - // because the non-ROCr custom code object parsing is called before - // iterating over the code object symbols using ROCr - if (KernelInfoTable.find(kernelName) == KernelInfoTable.end()) { - DP("amdgpu internal consistency error\n"); - return HSA_STATUS_ERROR; - } - // found, so assign and update - info = KernelInfoTable[kernelName]; - - /* Extract dispatch information from the symbol */ - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, - &(info.kernel_object)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the symbol from the executable", - get_error_string(err)); - return err; - } - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, - &(info.group_segment_size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the group segment size from the executable", - get_error_string(err)); - return err; - } - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, - &(info.private_segment_size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Extracting the private segment from the executable", - get_error_string(err)); - return err; - } - - DP("Kernel %s --> %lx symbol %u group segsize %u pvt segsize %u bytes " - "kernarg\n", - kernelName.c_str(), info.kernel_object, info.group_segment_size, - info.private_segment_size, info.kernel_segment_size); - - // assign it back to the kernel info table - KernelInfoTable[kernelName] = info; - free(name); - } else if (type == HSA_SYMBOL_KIND_VARIABLE) { - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &name_length); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - char *name = reinterpret_cast(malloc(name_length + 1)); - err = hsa_executable_symbol_get_info(symbol, - HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info extraction", get_error_string(err)); - return err; - } - name[name_length] = 0; - - atl_symbol_info_t info; - - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &(info.addr)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info address extraction", get_error_string(err)); - return err; - } - - err = hsa_executable_symbol_get_info( - symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &(info.size)); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Symbol info size extraction", get_error_string(err)); - return err; - } - - DP("Symbol %s = %p (%u bytes)\n", name, (void *)info.addr, info.size); - SymbolInfoTable[std::string(name)] = info; - free(name); - } else { - DP("Symbol is an indirect function\n"); - } - return HSA_STATUS_SUCCESS; -} - -hsa_status_t RegisterModuleFromMemory( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *module_bytes, size_t module_size, hsa_agent_t agent, - hsa_status_t (*on_deserialized_data)(void *data, size_t size, - void *cb_state), - void *cb_state, std::vector 
&HSAExecutables) { - hsa_status_t err; - hsa_executable_t executable = {0}; - hsa_profile_t agent_profile; - - err = hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_profile); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Query the agent profile", get_error_string(err)); - return HSA_STATUS_ERROR; - } - // FIXME: Assume that every profile is FULL until we understand how to build - // GCN with base profile - agent_profile = HSA_PROFILE_FULL; - /* Create the empty executable. */ - err = hsa_executable_create(agent_profile, HSA_EXECUTABLE_STATE_UNFROZEN, "", - &executable); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Create the executable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - bool module_load_success = false; - do // Existing control flow used continue, preserve that for this patch - { - { - // Some metadata info is not available through ROCr API, so use custom - // code object metadata parsing to collect such metadata info - - err = get_code_object_custom_metadata(module_bytes, module_size, - KernelInfoTable); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Getting custom code object metadata", get_error_string(err)); - continue; - } - - // Deserialize code object. - hsa_code_object_t code_object = {0}; - err = hsa_code_object_deserialize(module_bytes, module_size, NULL, - &code_object); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Code Object Deserialization", get_error_string(err)); - continue; - } - assert(0 != code_object.handle); - - // Mutating the device image here avoids another allocation & memcpy - void *code_object_alloc_data = - reinterpret_cast(code_object.handle); - hsa_status_t impl_err = - on_deserialized_data(code_object_alloc_data, module_size, cb_state); - if (impl_err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Error in deserialized_data callback", - get_error_string(impl_err)); - return impl_err; - } - - /* Load the code object. */ - err = - hsa_executable_load_code_object(executable, agent, code_object, NULL); - if (err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Loading the code object", get_error_string(err)); - continue; - } - - // cannot iterate over symbols until executable is frozen - } - module_load_success = true; - } while (0); - DP("Modules loaded successful? %d\n", module_load_success); - if (module_load_success) { - /* Freeze the executable; it can now be queried for symbols. 
*/ - err = hsa_executable_freeze(executable, ""); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Freeze the executable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - err = hsa::executable_iterate_symbols( - executable, - [&](hsa_executable_t, hsa_executable_symbol_t symbol) -> hsa_status_t { - return populate_InfoTables(symbol, KernelInfoTable, SymbolInfoTable); - }); - if (err != HSA_STATUS_SUCCESS) { - printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterating over symbols for execuatable", get_error_string(err)); - return HSA_STATUS_ERROR; - } - - // save the executable and destroy during finalize - HSAExecutables.push_back(executable); - return HSA_STATUS_SUCCESS; - } else { - return HSA_STATUS_ERROR; - } -} - -} // namespace core diff --git a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h b/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/print_tracing.h +++ /dev/null @@ -1,20 +0,0 @@ -//===--- amdgpu/src/print_tracing.h ------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED -#define LIBOMPTARGET_PLUGINS_AMGGPU_SRC_PRINT_TRACING_H_INCLUDED - -enum PrintTraceControlBits { - LAUNCH = 1, // print a message to stderr for each kernel launch - RTL_TIMING = 2, // Print timing info around each RTL step - STARTUP_DETAILS = 4, // Details around loading up kernel - RTL_TO_STDOUT = 8 // Redirect RTL tracing to stdout -}; - -extern int print_kernel_trace; // set by environment variable - -#endif diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ /dev/null @@ -1,2615 +0,0 @@ -//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for AMD hsa machine -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" -#include "llvm/Frontend/OpenMP/OMPConstants.h" -#include "llvm/Frontend/OpenMP/OMPGridValues.h" -#include "llvm/Object/ELF.h" -#include "llvm/Object/ELFObjectFile.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ELFSymbols.h" -#include "impl_runtime.h" -#include "interop_hsa.h" - -#include "UtilitiesRTL.h" -#include "internal.h" -#include "rt.h" - -#include "DeviceEnvironment.h" -#include "get_elf_mach_gfx_name.h" -#include "omptargetplugin.h" -#include "print_tracing.h" - -using namespace llvm; -using namespace llvm::object; -using namespace llvm::ELF; -using namespace llvm::omp::target::plugin::utils; - -// hostrpc interface, FIXME: consider moving to its own include these are -// statically linked into amdgpu/plugin if present from hostrpc_services.a, -// linked as --whole-archive to override the weak symbols that are used to -// implement a fallback for toolchains that do not yet have a hostrpc library. -extern "C" { -uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, - uint32_t DeviceId); -hsa_status_t hostrpc_init(); -hsa_status_t hostrpc_terminate(); - -__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } -__attribute__((weak)) hsa_status_t hostrpc_terminate() { - return HSA_STATUS_SUCCESS; -} -__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, - uint32_t DeviceId) { - DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library " - "missing\n", - DeviceId); - return 0; -} -} - -// Heuristic parameters used for kernel launch -// Number of teams per CU to allow scheduling flexibility -static const unsigned DefaultTeamsPerCU = 4; - -int print_kernel_trace; - -#ifdef OMPTARGET_DEBUG -#define check(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ - DP(#msg " failed\n"); \ - } else { \ - DP(#msg " succeeded\n"); \ - } -#else -#define check(msg, status) \ - {} -#endif - -#include "elf_common.h" - -namespace hsa { -template hsa_status_t iterate_agents(C Cb) { - auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { - C *Unwrapped = static_cast(Data); - return (*Unwrapped)(Agent); - }; - return hsa_iterate_agents(L, static_cast(&Cb)); -} - -template -hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { - auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { - C *Unwrapped = static_cast(Data); - return (*Unwrapped)(MemoryPool); - }; - - return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast(&Cb)); -} - -} // namespace hsa - -/// Keep entries table per device -struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -struct KernelArgPool { -private: - static pthread_mutex_t Mutex; - -public: - uint32_t KernargSegmentSize; - void *KernargRegion = nullptr; - std::queue FreeKernargSegments; - - uint32_t kernargSizeIncludingImplicit() { - return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy); - } - - ~KernelArgPool() { - if (KernargRegion) { - auto R = hsa_amd_memory_pool_free(KernargRegion); - if (R != HSA_STATUS_SUCCESS) { - DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)); - } - } - } - - // Can't 
really copy or move a mutex - KernelArgPool() = default; - KernelArgPool(const KernelArgPool &) = delete; - KernelArgPool(KernelArgPool &&) = delete; - - KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) - : KernargSegmentSize(KernargSegmentSize) { - - // impl uses one pool per kernel for all gpus, with a fixed upper size - // preserving that exact scheme here, including the queue - - hsa_status_t Err = hsa_amd_memory_pool_allocate( - MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0, - &KernargRegion); - - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)); - KernargRegion = nullptr; // paranoid - return; - } - - Err = core::allow_access_to_all_gpu_agents(KernargRegion); - if (Err != HSA_STATUS_SUCCESS) { - DP("hsa allow_access_to_all_gpu_agents failed: %s\n", - get_error_string(Err)); - auto R = hsa_amd_memory_pool_free(KernargRegion); - if (R != HSA_STATUS_SUCCESS) { - // if free failed, can't do anything more to resolve it - DP("hsa memory poll free failed: %s\n", get_error_string(Err)); - } - KernargRegion = nullptr; - return; - } - - for (int I = 0; I < MAX_NUM_KERNELS; I++) { - FreeKernargSegments.push(I); - } - } - - void *allocate(uint64_t ArgNum) { - assert((ArgNum * sizeof(void *)) == KernargSegmentSize); - Lock L(&Mutex); - void *Res = nullptr; - if (!FreeKernargSegments.empty()) { - - int FreeIdx = FreeKernargSegments.front(); - Res = static_cast(static_cast(KernargRegion) + - (FreeIdx * kernargSizeIncludingImplicit())); - assert(FreeIdx == pointerToIndex(Res)); - FreeKernargSegments.pop(); - } - return Res; - } - - void deallocate(void *Ptr) { - Lock L(&Mutex); - int Idx = pointerToIndex(Ptr); - FreeKernargSegments.push(Idx); - } - -private: - int pointerToIndex(void *Ptr) { - ptrdiff_t Bytes = - static_cast(Ptr) - static_cast(KernargRegion); - assert(Bytes >= 0); - assert(Bytes % kernargSizeIncludingImplicit() == 0); - return Bytes / kernargSizeIncludingImplicit(); - } - struct Lock { - Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } - ~Lock() { pthread_mutex_unlock(M); } - pthread_mutex_t *M; - }; -}; -pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER; - -std::unordered_map> - KernelArgPoolMap; - -/// Use a single entity to encode a kernel and a set of flags -struct KernelTy { - llvm::omp::OMPTgtExecModeFlags ExecutionMode; - int16_t ConstWGSize; - int32_t DeviceId; - void *CallStackAddr = nullptr; - const char *Name; - - KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, - int32_t DeviceId, void *CallStackAddr, const char *Name, - uint32_t KernargSegmentSize, - hsa_amd_memory_pool_t &KernArgMemoryPool) - : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), - DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { - DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); - - std::string N(Name); - if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { - KernelArgPoolMap.insert( - std::make_pair(N, std::unique_ptr(new KernelArgPool( - KernargSegmentSize, KernArgMemoryPool)))); - } - } -}; - -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. 
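KernelArgPool above is a fixed-capacity slab allocator: a single memory-pool allocation holds MAX_NUM_KERNELS equally sized kernarg slots, and a queue of integer indices recycles them under a mutex. The same scheme in miniature, with illustrative names and std::mutex standing in for the plugin's pthread mutex:

.. code-block:: cpp

   #include <cstddef>
   #include <mutex>
   #include <queue>
   #include <vector>

   class SlotPool {
     std::mutex M;
     std::vector<char> Storage; // one backing allocation for all slots
     std::queue<int> FreeSlots; // recycled slot indices
     size_t SlotBytes;

   public:
     SlotPool(size_t SlotBytes, int NumSlots)
         : Storage(SlotBytes * NumSlots), SlotBytes(SlotBytes) {
       for (int I = 0; I < NumSlots; I++)
         FreeSlots.push(I);
     }
     void *allocate() {
       std::lock_guard<std::mutex> L(M);
       if (FreeSlots.empty())
         return nullptr; // caller handles exhaustion, as the plugin does
       int Idx = FreeSlots.front();
       FreeSlots.pop();
       return Storage.data() + Idx * SlotBytes;
     }
     void deallocate(void *P) {
       std::lock_guard<std::mutex> L(M);
       FreeSlots.push((static_cast<char *>(P) - Storage.data()) / SlotBytes);
     }
   };

Recycling indices rather than pointers keeps the bookkeeping trivially checkable, which is what the pointerToIndex assertions in the deleted class rely on.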
-std::list KernelsList; - -template static hsa_status_t findAgents(Callback CB) { - - hsa_status_t Err = - hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { - hsa_device_type_t DeviceType; - // get_info fails iff HSA runtime not yet initialized - hsa_status_t Err = - hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); - - if (Err != HSA_STATUS_SUCCESS) { - if (print_kernel_trace > 0) - DP("rtl.cpp: err %s\n", get_error_string(Err)); - - return Err; - } - - CB(DeviceType, Agent); - return HSA_STATUS_SUCCESS; - }); - - // iterate_agents fails iff HSA runtime not yet initialized - if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { - DP("rtl.cpp: err %s\n", get_error_string(Err)); - } - - return Err; -} - -static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, - void *Data) { - if (Status != HSA_STATUS_SUCCESS) { - const char *StatusString; - if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { - StatusString = "unavailable"; - } - DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source, - Status, StatusString); - abort(); - } -} - -namespace core { -namespace { - -bool checkResult(hsa_status_t Err, const char *ErrMsg) { - if (Err == HSA_STATUS_SUCCESS) - return true; - - REPORT("%s", ErrMsg); - REPORT("%s", get_error_string(Err)); - return false; -} - -void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { - __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE); -} - -uint16_t createHeader() { - uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; - Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - return Header; -} - -hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { - bool AllocAllowed = false; - hsa_status_t Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, - &AllocAllowed); - if (Err != HSA_STATUS_SUCCESS) { - DP("Alloc allowed in memory pool check failed: %s\n", - get_error_string(Err)); - return Err; - } - - size_t Size = 0; - Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, - &Size); - if (Err != HSA_STATUS_SUCCESS) { - DP("Get memory pool size failed: %s\n", get_error_string(Err)); - return Err; - } - - return (AllocAllowed && Size > 0) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; -} - -hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { - std::vector *Result = - static_cast *>(Data); - - hsa_status_t Err; - if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { - return Err; - } - - Result->push_back(MemoryPool); - return HSA_STATUS_SUCCESS; -} - -} // namespace -} // namespace core - -struct EnvironmentVariables { - int NumTeams; - int TeamLimit; - int TeamThreadLimit; - int MaxTeamsDefault; - int DynamicMemSize; -}; - -template -static constexpr const llvm::omp::GV &getGridValue() { - return llvm::omp::getAMDGPUGridValues(); -} - -struct HSALifetime { - // Wrapper around HSA used to ensure it is constructed before other types - // and destructed after, which means said other types can use raii for - // cleanup without risking running outside of the lifetime of HSA - const hsa_status_t S; - - bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } - HSALifetime() : S(hsa_init()) {} - - ~HSALifetime() { - if (S == HSA_STATUS_SUCCESS) { - hsa_status_t Err = hsa_shut_down(); - if (Err != HSA_STATUS_SUCCESS) { - // Can't call into HSA to get a string from the integer - DP("Shutting down HSA failed: %d\n", Err); - } - } - } -}; - -// Handle scheduling of multiple hsa_queue's per device to -// multiple threads (one scheduler per device) -class HSAQueueScheduler { -public: - HSAQueueScheduler() : Current(0) {} - - HSAQueueScheduler(const HSAQueueScheduler &) = delete; - - HSAQueueScheduler(HSAQueueScheduler &&Q) { - Current = Q.Current.load(); - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - HSAQueues[I] = Q.HSAQueues[I]; - Q.HSAQueues[I] = nullptr; - } - } - - // \return false if any HSA queue creation fails - bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - hsa_queue_t *Q = nullptr; - hsa_status_t Rc = - hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, - callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q); - if (Rc != HSA_STATUS_SUCCESS) { - DP("Failed to create HSA queue %d\n", I); - return false; - } - HSAQueues[I] = Q; - } - return true; - } - - ~HSAQueueScheduler() { - for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { - if (HSAQueues[I]) { - hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); - if (Err != HSA_STATUS_SUCCESS) - DP("Error destroying HSA queue"); - } - } - } - - // \return next queue to use for device - hsa_queue_t *next() { - return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % - NUM_QUEUES_PER_DEVICE]; - } - -private: - // Number of queues per device - enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; - hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; - std::atomic Current; -}; - -/// Class containing all the device information -class RTLDeviceInfoTy : HSALifetime { - std::vector> FuncGblEntries; - - struct QueueDeleter { - void operator()(hsa_queue_t *Q) { - if (Q) { - hsa_status_t Err = hsa_queue_destroy(Q); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error destroying hsa queue: %s\n", get_error_string(Err)); - } - } - } - }; - -public: - bool ConstructionSucceeded = false; - - // load binary populates symbol tables and mutates various global state - // run uses those symbol tables - std::shared_timed_mutex LoadRunLock; - - int NumberOfDevices = 0; - - // GPU devices - std::vector HSAAgents; - std::vector HSAQueueSchedulers; // one per gpu - - // CPUs - std::vector CPUAgents; - - // Device properties - std::vector ComputeUnits; - std::vector GroupsPerDevice; - 
std::vector ThreadsPerGroup; - std::vector WarpSize; - std::vector GPUName; - std::vector TargetID; - - // OpenMP properties - std::vector NumTeams; - std::vector NumThreads; - - // OpenMP Environment properties - EnvironmentVariables Env; - - // OpenMP Requires Flags - int64_t RequiresFlags; - - // Resource pools - SignalPoolT FreeSignalPool; - - bool HostcallRequired = false; - - std::vector HSAExecutables; - - std::vector> KernelInfoTable; - std::vector> SymbolInfoTable; - - hsa_amd_memory_pool_t KernArgPool; - - // fine grained memory pool for host allocations - hsa_amd_memory_pool_t HostFineGrainedMemoryPool; - - // fine and coarse-grained memory pools per offloading device - std::vector DeviceFineGrainedMemoryPools; - std::vector DeviceCoarseGrainedMemoryPools; - - struct ImplFreePtrDeletor { - void operator()(void *P) { - core::Runtime::Memfree(P); // ignore failure to free - } - }; - - // device_State shared across loaded binaries, error if inconsistent size - std::vector, uint64_t>> - DeviceStateStore; - - static const unsigned HardTeamLimit = - (1 << 16) - 1; // 64K needed to fit in uint16 - static const int DefaultNumTeams = 128; - - // These need to be per-device since different devices can have different - // wave sizes, but are currently the same number for each so that refactor - // can be postponed. - static_assert(getGridValue<32>().GV_Max_Teams == - getGridValue<64>().GV_Max_Teams, - ""); - static const int MaxTeams = getGridValue<64>().GV_Max_Teams; - - static_assert(getGridValue<32>().GV_Max_WG_Size == - getGridValue<64>().GV_Max_WG_Size, - ""); - static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; - - static_assert(getGridValue<32>().GV_Default_WG_Size == - getGridValue<64>().GV_Default_WG_Size, - ""); - static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; - - using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, - hsa_agent_t, hsa_amd_memory_pool_t); - hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, - MemcpyFunc Func, int32_t DeviceId) { - hsa_agent_t Agent = HSAAgents[DeviceId]; - hsa_signal_t S = FreeSignalPool.pop(); - if (S.handle == 0) { - return HSA_STATUS_ERROR; - } - hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); - FreeSignalPool.push(S); - return R; - } - - hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, - int32_t DeviceId) { - return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); - } - - hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, - int32_t DeviceId) { - return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); - } - - static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { - char TmpChar[1000]; - uint16_t Major, Minor; - uint32_t TmpUInt; - uint32_t TmpUInt2; - uint32_t CacheSize[4]; - bool TmpBool; - uint16_t WorkgroupMaxDim[3]; - hsa_dim3_t GridMaxDim; - - // Getting basic information about HSA and Device - core::checkResult( - hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), - "Error from hsa_system_get_info when obtaining " - "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); - core::checkResult( - hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), - "Error from hsa_system_get_info when obtaining " - "HSA_SYSTEM_INFO_VERSION_MINOR\n"); - printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); - printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); - core::checkResult( - hsa_agent_get_info( - Agent, 
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); - printf(" Product Name: \t\t\t%s \n", TmpChar); - core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_NAME\n"); - printf(" Device Name: \t\t\t%s \n", TmpChar); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_NAME\n"); - printf(" Vendor Name: \t\t\t%s \n", TmpChar); - hsa_device_type_t DevType; - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_DEVICE\n"); - printf(" Device Type: \t\t\t%s \n", - DevType == HSA_DEVICE_TYPE_CPU - ? "CPU" - : (DevType == HSA_DEVICE_TYPE_GPU - ? "GPU" - : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUES_MAX\n"); - printf(" Max Queues: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); - printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); - printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); - - // Getting cache information - printf(" Cache:\n"); - - // FIXME: This is deprecated according to HSA documentation. But using - // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during - // runtime. 
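Every query in printDeviceInfo follows the same shape: hsa_agent_get_info into a stack variable, checkResult on the status, then printf. A sketch of a helper that would factor that pattern out (hypothetical; the deleted code deliberately keeps the calls inline, and the HSA header path may differ per ROCm install):

.. code-block:: cpp

   #include <cstdio>

   #include "hsa.h" // or hsa/hsa.h, depending on the install

   // Query one agent attribute, reporting failures like checkResult does.
   template <typename T>
   bool queryAgentInfo(hsa_agent_t Agent, hsa_agent_info_t Attr, T *Out,
                       const char *What) {
     hsa_status_t Err = hsa_agent_get_info(Agent, Attr, Out);
     if (Err != HSA_STATUS_SUCCESS) {
       fprintf(stderr, "hsa_agent_get_info(%s) failed\n", What);
       return false;
     }
     return true;
   }

   // Usage:
   //   uint32_t MaxQueues;
   //   if (queryAgentInfo(Agent, HSA_AGENT_INFO_QUEUES_MAX, &MaxQueues,
   //                      "QUEUES_MAX"))
   //     printf("  Max Queues: %u\n", MaxQueues);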
- core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_CACHE_SIZE\n"); - - for (int I = 0; I < 4; I++) { - if (CacheSize[I]) { - printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); - } - } - - core::checkResult( - hsa_agent_get_info(Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); - printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); - printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); - printf(" Compute Units: \t\t\t%u \n", TmpUInt); - core::checkResult(hsa_agent_get_info( - Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); - printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); - printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); - printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); - printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); - core::checkResult(hsa_agent_get_info(Agent, - HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - WorkgroupMaxDim), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); - printf(" Workgroup Max Size per Dimension:\n"); - printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); - printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); - printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); - core::checkResult(hsa_agent_get_info( - Agent, - (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, - &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); - printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); - printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); - printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); - core::checkResult( - hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_GRID_MAX_DIM\n"); - printf(" Grid Max Size per Dimension: \t\t\n"); - printf(" x: \t\t\t\t%u\n", GridMaxDim.x); - printf(" y: \t\t\t\t%u\n", GridMaxDim.y); - printf(" z: \t\t\t\t%u\n", GridMaxDim.z); - core::checkResult( - 
hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), - "Error returned from hsa_agent_get_info when obtaining " - "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); - printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); - - printf(" Memory Pools:\n"); - auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { - std::string TmpStr; - size_t Size; - bool Alloc, Access; - hsa_amd_segment_t Segment; - hsa_amd_memory_pool_global_flag_t GlobalFlags; - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); - core::checkResult(hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), - "Error returned from hsa_amd_memory_pool_get_info when " - "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); - - switch (Segment) { - case HSA_AMD_SEGMENT_GLOBAL: - TmpStr = "GLOBAL; FLAGS: "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) - TmpStr += "KERNARG, "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) - TmpStr += "FINE GRAINED, "; - if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) - TmpStr += "COARSE GRAINED, "; - break; - case HSA_AMD_SEGMENT_READONLY: - TmpStr = "READONLY"; - break; - case HSA_AMD_SEGMENT_PRIVATE: - TmpStr = "PRIVATE"; - break; - case HSA_AMD_SEGMENT_GROUP: - TmpStr = "GROUP"; - break; - } - printf(" Pool %s: \n", TmpStr.c_str()); - - core::checkResult(hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), - "Error returned from hsa_amd_memory_pool_get_info when " - "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); - printf(" Size: \t\t\t\t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); - printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); - printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); - printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); - core::checkResult( - hsa_amd_memory_pool_get_info( - Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), - "Error returned from hsa_amd_memory_pool_get_info when obtaining " - "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); - printf(" Accessable by all: \t\t %s\n", - (Access ? "TRUE" : "FALSE")); - - return HSA_STATUS_SUCCESS; - }; - // Iterate over all the memory regions for this agent. 
Get the memory region - // type and size - hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); - - printf(" ISAs:\n"); - auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { - char TmpChar[1000]; - core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), - "Error returned from hsa_isa_get_info_alt when " - "obtaining HSA_ISA_INFO_NAME\n"); - printf(" Name: \t\t\t\t %s\n", TmpChar); - - return HSA_STATUS_SUCCESS; - }; - // Iterate over all the memory regions for this agent. Get the memory region - // type and size - hsa_agent_iterate_isas(Agent, CBIsas, nullptr); - } - - // Record entry point associated with device - void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - E.Entries.push_back(Entry); - } - - // Return true if the entry is associated with device - bool findOffloadEntry(int32_t DeviceId, void *Addr) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - for (auto &It : E.Entries) { - if (It.addr == Addr) - return true; - } - - return false; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - - int32_t Size = E.Entries.size(); - - // Table is empty - if (!Size) - return 0; - - __tgt_offload_entry *Begin = &E.Entries[0]; - __tgt_offload_entry *End = &E.Entries[Size - 1]; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = Begin; - E.Table.EntriesEnd = ++End; - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(int DeviceId) { - assert(DeviceId < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[DeviceId].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); - // KernelArgPoolMap.clear(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = 0; - } - - hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, - unsigned int DeviceId) { - assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here."); - uint32_t GlobalFlags = 0; - hsa_status_t Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); - - if (Err != HSA_STATUS_SUCCESS) { - return Err; - } - - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; - } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { - DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; - } - - return HSA_STATUS_SUCCESS; - } - - hsa_status_t setupDevicePools(const std::vector &Agents) { - for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { - hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( - Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { - hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); - if (ValidStatus != HSA_STATUS_SUCCESS) { - DP("Alloc allowed in memory pool check failed: %s\n", - get_error_string(ValidStatus)); - return HSA_STATUS_SUCCESS; - } - return addDeviceMemoryPool(MemoryPool, DeviceId); - }); - - if (Err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Iterate all memory 
pools", get_error_string(Err)); - return Err; - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t setupHostMemoryPools(std::vector &Agents) { - std::vector HostPools; - - // collect all the "valid" pools for all the given agents. - for (const auto &Agent : Agents) { - hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( - Agent, core::addMemoryPool, static_cast(&HostPools)); - if (Err != HSA_STATUS_SUCCESS) { - DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)); - } - } - - // We need two fine-grained pools. - // 1. One with kernarg flag set for storing kernel arguments - // 2. Second for host allocations - bool FineGrainedMemoryPoolSet = false; - bool KernArgPoolSet = false; - for (const auto &MemoryPool : HostPools) { - hsa_status_t Err = HSA_STATUS_SUCCESS; - uint32_t GlobalFlags = 0; - Err = hsa_amd_memory_pool_get_info( - MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); - if (Err != HSA_STATUS_SUCCESS) { - DP("Get memory pool info failed: %s\n", get_error_string(Err)); - return Err; - } - - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { - if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { - KernArgPool = MemoryPool; - KernArgPoolSet = true; - } else { - HostFineGrainedMemoryPool = MemoryPool; - FineGrainedMemoryPoolSet = true; - } - } - } - - if (FineGrainedMemoryPoolSet && KernArgPoolSet) - return HSA_STATUS_SUCCESS; - - return HSA_STATUS_ERROR; - } - - hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { - assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && - "Invalid device Id"); - return DeviceCoarseGrainedMemoryPools[DeviceId]; - } - - hsa_amd_memory_pool_t getHostMemoryPool() { - return HostFineGrainedMemoryPool; - } - - static int readEnv(const char *Env, int Default = -1) { - const char *EnvStr = getenv(Env); - int Res = Default; - if (EnvStr) { - Res = std::stoi(EnvStr); - DP("Parsed %s=%d\n", Env, Res); - } - return Res; - } - - RTLDeviceInfoTy() { - DP("Start initializing " GETNAME(TARGET_NAME) "\n"); - - // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr - // anytime. You do not need a debug library build. 
- // 0 => no tracing - // 1 => tracing dispatch only - // >1 => verbosity increase - - if (!HSAInitSuccess()) { - DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"); - return; - } - - if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) - print_kernel_trace = atoi(EnvStr); - else - print_kernel_trace = 0; - - hsa_status_t Err = core::atl_init_gpu_context(); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when initializing " GETNAME(TARGET_NAME) "\n"); - return; - } - - // Init hostcall soon after initializing hsa - hostrpc_init(); - - Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { - if (DeviceType == HSA_DEVICE_TYPE_CPU) { - CPUAgents.push_back(Agent); - } else { - HSAAgents.push_back(Agent); - } - }); - if (Err != HSA_STATUS_SUCCESS) - return; - - NumberOfDevices = (int)HSAAgents.size(); - - if (NumberOfDevices == 0) { - DP("There are no devices supporting HSA.\n"); - return; - } - DP("There are %d devices supporting HSA.\n", NumberOfDevices); - - // Init the device info - HSAQueueSchedulers.reserve(NumberOfDevices); - FuncGblEntries.resize(NumberOfDevices); - ThreadsPerGroup.resize(NumberOfDevices); - ComputeUnits.resize(NumberOfDevices); - GPUName.resize(NumberOfDevices); - GroupsPerDevice.resize(NumberOfDevices); - WarpSize.resize(NumberOfDevices); - NumTeams.resize(NumberOfDevices); - NumThreads.resize(NumberOfDevices); - DeviceStateStore.resize(NumberOfDevices); - KernelInfoTable.resize(NumberOfDevices); - SymbolInfoTable.resize(NumberOfDevices); - DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); - DeviceFineGrainedMemoryPools.resize(NumberOfDevices); - - Err = setupDevicePools(HSAAgents); - if (Err != HSA_STATUS_SUCCESS) { - DP("Setup for Device Memory Pools failed\n"); - return; - } - - Err = setupHostMemoryPools(CPUAgents); - if (Err != HSA_STATUS_SUCCESS) { - DP("Setup for Host Memory Pools failed\n"); - return; - } - - for (int I = 0; I < NumberOfDevices; I++) { - uint32_t QueueSize = 0; - { - hsa_status_t Err = hsa_agent_get_info( - HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); - if (Err != HSA_STATUS_SUCCESS) { - DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I); - return; - } - enum { MaxQueueSize = 4096 }; - if (QueueSize > MaxQueueSize) { - QueueSize = MaxQueueSize; - } - } - - { - HSAQueueScheduler QSched; - if (!QSched.createQueues(HSAAgents[I], QueueSize)) - return; - HSAQueueSchedulers.emplace_back(std::move(QSched)); - } - - DeviceStateStore[I] = {nullptr, 0}; - } - - for (int I = 0; I < NumberOfDevices; I++) { - ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; - GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; - ComputeUnits[I] = 1; - DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I, - GroupsPerDevice[I], ThreadsPerGroup[I]); - } - - // Get environment variables regarding teams - Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); - Env.NumTeams = readEnv("OMP_NUM_TEAMS"); - Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); - Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); - Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); - - // Default state. 
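// A note on the environment handling just above: readEnv treats an unset
// variable as "use the caller's default" (-1 unless overridden) and parses
// set values as base-10 integers. Minimal standalone sketch of that
// contract, assuming a POSIX host (setenv/unsetenv); readEnvSketch is a
// hypothetical stand-in for the readEnv helper defined earlier, not plugin
// code.
#include <cassert>
#include <cstdlib>
#include <string>

static int readEnvSketch(const char *Name, int Default = -1) {
  const char *S = std::getenv(Name);
  return S ? std::stoi(S) : Default;
}

static void readEnvSketchDemo() {
  unsetenv("OMP_NUM_TEAMS");
  assert(readEnvSketch("OMP_NUM_TEAMS") == -1); // unset -> default
  setenv("LIBOMPTARGET_SHARED_MEMORY_SIZE", "256", /*overwrite=*/1);
  assert(readEnvSketch("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0) == 256);
}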
- RequiresFlags = OMP_REQ_UNDEFINED; - - ConstructionSucceeded = true; - } - - ~RTLDeviceInfoTy() { - DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"); - if (!HSAInitSuccess()) { - // Then none of these can have been set up and they can't be torn down - return; - } - // Run destructors on types that use HSA before - // impl_finalize removes access to it - DeviceStateStore.clear(); - KernelArgPoolMap.clear(); - // Terminate hostrpc before finalizing hsa - hostrpc_terminate(); - - hsa_status_t Err; - for (uint32_t I = 0; I < HSAExecutables.size(); I++) { - Err = hsa_executable_destroy(HSAExecutables[I]); - if (Err != HSA_STATUS_SUCCESS) { - DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__, - "Destroying executable", get_error_string(Err)); - } - } - } -}; - -pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER; - -// Putting accesses to DeviceInfo global behind a function call prior -// to changing to use init_plugin/deinit_plugin calls -static RTLDeviceInfoTy DeviceInfoState; -static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; } - -namespace { - -int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // Return success if we are not copying back to host from target. - if (!HstPtr) - return OFFLOAD_SUCCESS; - hsa_status_t Err; - DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)TgtPtr, - (long long unsigned)(Elf64_Addr)HstPtr); - - Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, - DeviceId); - - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when copying data from device to host. Pointers: " - "host = 0x%016lx, device = 0x%016lx, size = %lld\n", - (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); - return OFFLOAD_FAIL; - } - DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)TgtPtr, - (long long unsigned)(Elf64_Addr)HstPtr); - return OFFLOAD_SUCCESS; -} - -int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - hsa_status_t Err; - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // Return success if we are not doing host to target. - if (!HstPtr) - return OFFLOAD_SUCCESS; - - DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)HstPtr, - (long long unsigned)(Elf64_Addr)TgtPtr); - Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, - DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when copying data from host to device. Pointers: " - "host = 0x%016lx, device = 0x%016lx, size = %lld\n", - (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -// Async. -// The implementation was written with cuda streams in mind. The semantics of -// that are to execute kernels on a queue in order of insertion. A synchronise -// call then makes writes visible between host and device. This means a series -// of N data_submit_async calls are expected to execute serially. HSA offers -// various options to run the data copies concurrently. This may require changes -// to libomptarget. - -// __tgt_async_info* contains a void * Queue. 
Queue = 0 is used to indicate that - // there are no outstanding kernels that need to be synchronized. Any async call - // may be passed a Queue==0, at which point the CUDA implementation will set it - // to non-null (see getStream). The CUDA streams are per-device. Upstream may - // change this interface to explicitly initialize the AsyncInfo pointer, but - // until then this plugin lazily initializes it as well. - -void initAsyncInfo(__tgt_async_info *AsyncInfo) { - // Set non-null while async calls are outstanding; reset to null on completion. - assert(AsyncInfo); - if (!AsyncInfo->Queue) { - AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX); - } -} -void finiAsyncInfo(__tgt_async_info *AsyncInfo) { - assert(AsyncInfo); - assert(AsyncInfo->Queue); - AsyncInfo->Queue = 0; -} - -// Determine launch values for kernel. -struct LaunchVals { - int WorkgroupSize; - int GridSize; -}; -LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, - int ConstWGSize, - llvm::omp::OMPTgtExecModeFlags ExecutionMode, - int NumTeams, int ThreadLimit, uint64_t LoopTripcount, - int DeviceNumTeams) { - - int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; - int NumGroups = 0; - - int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; - if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) - MaxTeams = RTLDeviceInfoTy::HardTeamLimit; - - if (print_kernel_trace & STARTUP_DETAILS) { - DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams); - DP("Max_Teams: %d\n", MaxTeams); - DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize); - DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize); - DP("RTLDeviceInfoTy::Default_WG_Size: %d\n", - RTLDeviceInfoTy::DefaultWgSize); - DP("thread_limit: %d\n", ThreadLimit); - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("ConstWGSize: %d\n", ConstWGSize); - } - // Check for a thread_limit() clause. - if (ThreadLimit > 0) { - ThreadsPerGroup = ThreadLimit; - DP("Setting threads per block to requested %d\n", ThreadLimit); - // Add master warp for GENERIC - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - ThreadsPerGroup += WarpSize; - DP("Adding master wavefront: +%d threads\n", WarpSize); - } - if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max - ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; - DP("Setting threads per block to maximum %d\n", ThreadsPerGroup); - } - } - // Check the flat_max_work_group_size attribute here. - if (ThreadsPerGroup > ConstWGSize) { - ThreadsPerGroup = ConstWGSize; - DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", - ThreadsPerGroup); - } - if (print_kernel_trace & STARTUP_DETAILS) - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("Preparing %d threads\n", ThreadsPerGroup); - - // Set default num_groups (teams) - if (Env.TeamLimit > 0) - NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; - else - NumGroups = MaxTeams; - DP("Set default num of groups %d\n", NumGroups); - - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", NumGroups); - DP("num_teams: %d\n", NumTeams); - } - - // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size. - // This reduction is typical for the default case (no thread_limit clause) or - // when the user requests an excessively large num_teams. - // FIXME: We can't distinguish between a constant or variable thread limit, - // so we only handle constant thread_limits.
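// The clamping sequence above, condensed into a pure function. A minimal
// sketch assuming the constants this file's comments imply (DefaultWgSize =
// 256, MaxWgSize = 1024); clampWorkgroupSizeSketch is hypothetical, for
// illustration only.
static int clampWorkgroupSizeSketch(int ThreadLimit, int WarpSize,
                                    int ConstWGSize, bool IsGenericMode,
                                    int DefaultWgSize = 256,
                                    int MaxWgSize = 1024) {
  int ThreadsPerGroup = DefaultWgSize;
  if (ThreadLimit > 0) {
    ThreadsPerGroup = ThreadLimit;
    if (IsGenericMode)
      ThreadsPerGroup += WarpSize; // reserve the master wavefront
    if (ThreadsPerGroup > MaxWgSize)
      ThreadsPerGroup = MaxWgSize;
  }
  if (ThreadsPerGroup > ConstWGSize) // flat_max_work_group_size attribute
    ThreadsPerGroup = ConstWGSize;
  return ThreadsPerGroup;
}
// E.g. thread_limit(896) in generic mode with a 64-wide wavefront yields
// 896 + 64 = 960 threads, still under the 1024 cap (assuming the kernel's
// ConstWGSize permits it).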
- if (ThreadsPerGroup > - RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 - // Should we round threadsPerGroup up to nearest WarpSize - // here? - NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; - - // check for num_teams() clause - if (NumTeams > 0) { - NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("num_groups: %d\n", NumGroups); - DP("Env.NumTeams %d\n", Env.NumTeams); - DP("Env.TeamLimit %d\n", Env.TeamLimit); - } - - if (Env.NumTeams > 0) { - NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; - DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams); - } else if (Env.TeamLimit > 0) { - NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; - DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit); - } else { - if (NumTeams <= 0) { - if (LoopTripcount > 0) { - if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { - // round up to the nearest integer - NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; - } else if (ExecutionMode == - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { - NumGroups = LoopTripcount; - } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { - // This is a generic kernel that was transformed to use SPMD-mode - // execution but uses Generic-mode semantics for scheduling. - NumGroups = LoopTripcount; - } - DP("Using %d teams due to loop trip count %" PRIu64 " and number of " - "threads per block %d\n", - NumGroups, LoopTripcount, ThreadsPerGroup); - } - } else { - NumGroups = NumTeams; - } - if (NumGroups > MaxTeams) { - NumGroups = MaxTeams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams); - } - if (NumGroups > NumTeams && NumTeams > 0) { - NumGroups = NumTeams; - if (print_kernel_trace & STARTUP_DETAILS) - DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups, - NumTeams); - } - } - - // num_teams clause always honored, no matter what, unless DEFAULT is active. - if (NumTeams > 0) { - NumGroups = NumTeams; - // Cap num_groups to EnvMaxTeamsDefault if set. - if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) - NumGroups = Env.MaxTeamsDefault; - } - if (print_kernel_trace & STARTUP_DETAILS) { - DP("threadsPerGroup: %d\n", ThreadsPerGroup); - DP("num_groups: %d\n", NumGroups); - DP("loop_tripcount: %ld\n", LoopTripcount); - } - DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups, - ThreadsPerGroup); - - LaunchVals Res; - Res.WorkgroupSize = ThreadsPerGroup; - Res.GridSize = ThreadsPerGroup * NumGroups; - return Res; -} - -static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { - uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); - bool Full = true; - while (Full) { - Full = - PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); - } - return PacketId; -} - -int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, - int32_t ThreadLimit, uint64_t LoopTripcount) { - // Set the context we are using - // update thread limit content in gpu memory if un-initialized or specified - // from host - - DP("Run target team region thread_limit %d\n", ThreadLimit); - - // All args are references. 
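// Worked instance of the SPMD sizing used above: the team count is a
// ceiling division of the loop tripcount by the workgroup size.
// ceilDivTeamsSketch is a hypothetical helper; with LoopTripcount = 1000 and
// ThreadsPerGroup = 256, NumGroups = ((1000 - 1) / 256) + 1 = 4 (three full
// teams plus one partial team).
#include <cstdint>
static unsigned ceilDivTeamsSketch(std::uint64_t Tripcount,
                                   unsigned ThreadsPerGroup) {
  return ((Tripcount - 1) / ThreadsPerGroup) + 1; // requires Tripcount > 0
}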
- std::vector<void *> Args(ArgNum); - std::vector<void *> Ptrs(ArgNum); - - DP("Arg_num: %d\n", ArgNum); - for (int32_t I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - DP("Offset-adjusted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])); - } - - KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; - - std::string KernelName = std::string(KernelInfo->Name); - auto &KernelInfoTable = DeviceInfo().KernelInfoTable; - if (KernelInfoTable[DeviceId].find(KernelName) == - KernelInfoTable[DeviceId].end()) { - DP("Kernel %s not found\n", KernelName.c_str()); - return OFFLOAD_FAIL; - } - - const atl_kernel_info_t KernelInfoEntry = - KernelInfoTable[DeviceId][KernelName]; - const uint32_t GroupSegmentSize = - KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; - const uint32_t SgprCount = KernelInfoEntry.sgpr_count; - const uint32_t VgprCount = KernelInfoEntry.vgpr_count; - const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; - const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; - - assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count); - - /* - * Set limit based on ThreadsPerGroup and GroupsPerDevice - */ - LaunchVals LV = - getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, - KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, - NumTeams, // From run_region arg - ThreadLimit, // From run_region arg - LoopTripcount, // From run_region arg - DeviceInfo().NumTeams[KernelInfo->DeviceId]); - const int GridSize = LV.GridSize; - const int WorkgroupSize = LV.WorkgroupSize; - - if (print_kernel_trace >= LAUNCH) { - int NumGroups = GridSize / WorkgroupSize; - // Execution-mode enum values: SPMD, GENERIC, NONE = 0, 1, 2. - // If doing RTL timing, print to stderr unless stdout was requested. - bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); - fprintf(TraceToStdout ? stdout : stderr, - "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", - DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, - ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, - GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, - VgprSpillCount, LoopTripcount, KernelInfo->Name); - } - - // Run on the device.
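// Editorial sketch of the by-reference marshalling above: each kernarg slot
// ultimately receives *Args[I], the offset-adjusted pointer stored in
// Ptrs[I]. Hypothetical, host-only illustration with a stack buffer standing
// in for device memory.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

static void marshalSketch() {
  char DeviceBuf[128];                 // stand-in for a device allocation
  void *TgtArgs[1] = {DeviceBuf};
  std::ptrdiff_t TgtOffsets[1] = {64}; // e.g. a mapped field at offset 64
  std::vector<void *> Ptrs(1), Args(1);
  Ptrs[0] = (void *)((intptr_t)TgtArgs[0] + TgtOffsets[0]);
  Args[0] = &Ptrs[0];
  void *KernArgSlot = nullptr;         // one 8-byte kernarg slot
  std::memcpy(&KernArgSlot, Args[0], sizeof(void *));
  assert(KernArgSlot == DeviceBuf + 64);
}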
- { - hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); - if (!Queue) { - return OFFLOAD_FAIL; - } - uint64_t PacketId = acquireAvailablePacketId(Queue); - - const uint32_t Mask = Queue->size - 1; // size is a power of 2 - hsa_kernel_dispatch_packet_t *Packet = - (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); - - // packet->header is written last - Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - Packet->workgroup_size_x = WorkgroupSize; - Packet->workgroup_size_y = 1; - Packet->workgroup_size_z = 1; - Packet->reserved0 = 0; - Packet->grid_size_x = GridSize; - Packet->grid_size_y = 1; - Packet->grid_size_z = 1; - Packet->private_segment_size = KernelInfoEntry.private_segment_size; - Packet->group_segment_size = GroupSegmentSize; - Packet->kernel_object = KernelInfoEntry.kernel_object; - Packet->kernarg_address = 0; // use the block allocator - Packet->reserved2 = 0; // impl writes id_ here - Packet->completion_signal = {0}; // may want a pool of signals - - KernelArgPool *ArgPool = nullptr; - void *KernArg = nullptr; - { - auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); - if (It != KernelArgPoolMap.end()) { - ArgPool = (It->second).get(); - } - } - if (!ArgPool) { - DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name, - DeviceId); - } - { - if (ArgPool) { - assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))); - KernArg = ArgPool->allocate(ArgNum); - } - if (!KernArg) { - DP("Allocate kernarg failed\n"); - return OFFLOAD_FAIL; - } - - // Copy explicit arguments - for (int I = 0; I < ArgNum; I++) { - memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); - } - - // Initialize implicit arguments. TODO: Which of these can be dropped - AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast( - static_cast(KernArg) + ArgPool->KernargSegmentSize); - memset(ImplArgs, 0, - sizeof(AMDGPUImplicitArgsTy)); // may not be necessary - ImplArgs->OffsetX = 0; - ImplArgs->OffsetY = 0; - ImplArgs->OffsetZ = 0; - - // assign a hostcall buffer for the selected Q - if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) { - // hostrpc_assign_buffer is not thread safe, and this function is - // under a multiple reader lock, not a writer lock. - static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&HostcallInitLock); - uint64_t Buffer = hostrpc_assign_buffer( - DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); - pthread_mutex_unlock(&HostcallInitLock); - if (!Buffer) { - DP("hostrpc_assign_buffer failed, gpu would dereference null and " - "error\n"); - return OFFLOAD_FAIL; - } - - DP("Implicit argument count: %d\n", - KernelInfoEntry.implicit_argument_count); - if (KernelInfoEntry.implicit_argument_count >= 4) { - // Initialise pointer for implicit_argument_count != 0 ABI - // Guess that the right implicit argument is at offset 24 after - // the explicit arguments. In the future, should be able to read - // the offset from msgpack. Clang is not annotating it at present. 
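// The offset guess described above, made concrete: with N explicit kernel
// arguments the hostcall slot is assumed to live at byte 8 * (N + 3), i.e.
// 24 bytes past the end of the explicit arguments on a 64-bit host.
// hostcallOffsetSketch is an editorial check, not plugin code.
#include <cstdint>
constexpr std::uint64_t hostcallOffsetSketch(std::uint32_t ExplicitArgCount) {
  return sizeof(void *) * (ExplicitArgCount + 3);
}
static_assert(hostcallOffsetSketch(3) == 48,
              "3 explicit args -> hostcall pointer at byte offset 48");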
- uint64_t Offset = - sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); - if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { - DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit " - "args: %d\n", - Offset + 8, ArgPool->kernargSizeIncludingImplicit()); - } else { - memcpy(static_cast(KernArg) + Offset, &Buffer, 8); - } - } - - // initialise pointer for implicit_argument_count == 0 ABI - ImplArgs->HostcallPtr = Buffer; - } - - Packet->kernarg_address = KernArg; - } - - hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); - if (S.handle == 0) { - DP("Failed to get signal instance\n"); - return OFFLOAD_FAIL; - } - Packet->completion_signal = S; - hsa_signal_store_relaxed(Packet->completion_signal, 1); - - // Publish the packet indicating it is ready to be processed - core::packetStoreRelease(reinterpret_cast(Packet), - core::createHeader(), Packet->setup); - - // Since the packet is already published, its contents must not be - // accessed any more - hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); - - while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) - ; - - assert(ArgPool); - ArgPool->deallocate(KernArg); - DeviceInfo().FreeSignalPool.push(S); - } - - DP("Kernel completed\n"); - return OFFLOAD_SUCCESS; -} - -bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { - const uint16_t AmdgcnMachineID = EM_AMDGPU; - const int32_t R = elf_check_machine(Image, AmdgcnMachineID); - if (!R) { - DP("Supported machine ID not found\n"); - } - return R; -} - -uint32_t elfEFlags(__tgt_device_image *Image) { - const char *ImgBegin = (char *)Image->ImageStart; - size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; - - StringRef Buffer = StringRef(ImgBegin, ImgSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - consumeError(ElfOrErr.takeError()); - return 0; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return ELFObj->getPlatformFlags(); - return 0; -} - -template bool enforceUpperBound(T *Value, T Upper) { - bool Changed = *Value > Upper; - if (Changed) { - *Value = Upper; - } - return Changed; -} - -struct SymbolInfo { - const void *Addr = nullptr; - uint32_t Size = UINT32_MAX; - uint32_t ShType = SHT_NULL; -}; - -int getSymbolInfoWithoutLoading(const ELFObjectFile &ELFObj, - StringRef SymName, SymbolInfo *Res) { - auto SymOrErr = getELFSymbol(ELFObj, SymName); - if (!SymOrErr) { - std::string ErrorString = toString(SymOrErr.takeError()); - DP("Failed ELF lookup: %s\n", ErrorString.c_str()); - return 1; - } - if (!*SymOrErr) - return 1; - - auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx); - if (!SymSecOrErr) { - std::string ErrorString = toString(SymOrErr.takeError()); - DP("Failed ELF lookup: %s\n", ErrorString.c_str()); - return 1; - } - - Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base(); - Res->Size = static_cast((*SymOrErr)->st_size); - Res->ShType = static_cast((*SymSecOrErr)->sh_type); - return 0; -} - -int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName, - SymbolInfo *Res) { - StringRef Buffer = StringRef(Base, ImgSize); - auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), - /*InitContent=*/false); - if (!ElfOrErr) { - REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str()); - return 1; - } - - if (const auto *ELFObj = dyn_cast(ElfOrErr->get())) - return getSymbolInfoWithoutLoading(*ELFObj, 
SymName, Res); - return 1; -} - -hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, - const char *SymName, const void **VarAddr, - uint32_t *VarSize) { - SymbolInfo SI; - int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); - if (Rc == 0) { - *VarAddr = SI.Addr; - *VarSize = SI.Size; - return HSA_STATUS_SUCCESS; - } - return HSA_STATUS_ERROR; -} - -template -hsa_status_t moduleRegisterFromMemoryToPlace( - std::map &KernelInfoTable, - std::map &SymbolInfoTable, - void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, - std::vector &HSAExecutables) { - auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { - C *Unwrapped = static_cast(CbState); - return (*Unwrapped)(Data, Size); - }; - return core::RegisterModuleFromMemory( - KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, - DeviceInfo().HSAAgents[DeviceId], L, static_cast(&Cb), - HSAExecutables); -} - -uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { - uint64_t DeviceStateBytes = 0; - { - // If this is the deviceRTL, get the state variable size - SymbolInfo SizeSi; - int Rc = getSymbolInfoWithoutLoading( - ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); - - if (Rc == 0) { - if (SizeSi.Size != sizeof(uint64_t)) { - DP("Found device_State_size variable with wrong size\n"); - return 0; - } - - // Read number of bytes directly from the elf - memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); - } - } - return DeviceStateBytes; -} - -struct DeviceEnvironment { - // initialise an DeviceEnvironmentTy in the deviceRTL - // patches around differences in the deviceRTL between trunk, aomp, - // rocmcc. Over time these differences will tend to zero and this class - // simplified. - // Symbol may be in .data or .bss, and may be missing fields, todo: - // review aomp/trunk/rocm and simplify the following - - // The symbol may also have been deadstripped because the device side - // accessors were unused. - - // If the symbol is in .data (aomp, rocm) it can be written directly. - // If it is in .bss, we must wait for it to be allocated space on the - // gpu (trunk) and initialize after loading. 
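// Editorial sketch of the trampoline used by moduleRegisterFromMemoryToPlace
// above: the loader wants a plain C function pointer, so a captureless
// lambda L recovers the real callable through the void* state argument. The
// names below are hypothetical.
#include <cstddef>
template <typename C>
static int invokeViaTrampolineSketch(void *Data, std::size_t Size, C Cb) {
  using CallbackTy = int (*)(void *, std::size_t, void *);
  CallbackTy L = [](void *D, std::size_t S, void *State) -> int {
    return (*static_cast<C *>(State))(D, S); // unwrap and forward
  };
  return L(Data, Size, static_cast<void *>(&Cb));
}
// Usage: invokeViaTrampolineSketch(nullptr, 0,
//                                  [](void *, std::size_t) { return 0; });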
- const char *sym() { return "__omp_rtl_device_environment"; } - - DeviceEnvironmentTy HostDeviceEnv; - SymbolInfo SI; - bool Valid = false; - - __tgt_device_image *Image; - const size_t ImgSize; - - DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, - __tgt_device_image *Image, const size_t ImgSize) - : Image(Image), ImgSize(ImgSize) { - - HostDeviceEnv.NumDevices = NumberDevices; - HostDeviceEnv.DeviceNum = DeviceId; - HostDeviceEnv.DebugKind = 0; - HostDeviceEnv.DynamicMemSize = DynamicMemSize; - if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - HostDeviceEnv.DebugKind = std::stoi(EnvStr); - - int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, - sym(), &SI); - if (Rc != 0) { - DP("Finding global device environment '%s' - symbol missing.\n", sym()); - return; - } - - if (SI.Size > sizeof(HostDeviceEnv)) { - DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size, - sizeof(HostDeviceEnv)); - return; - } - - Valid = true; - } - - bool inImage() { return SI.ShType != SHT_NOBITS; } - - hsa_status_t beforeLoading(void *Data, size_t Size) { - if (Valid) { - if (inImage()) { - DP("Setting global device environment before load (%u bytes)\n", - SI.Size); - uint64_t Offset = reinterpret_cast(SI.Addr) - - reinterpret_cast(Image->ImageStart); - void *Pos = reinterpret_cast(Data) + Offset; - memcpy(Pos, &HostDeviceEnv, SI.Size); - } - } - return HSA_STATUS_SUCCESS; - } - - hsa_status_t afterLoading() { - if (Valid) { - if (!inImage()) { - DP("Setting global device environment after load (%u bytes)\n", - SI.Size); - int DeviceId = HostDeviceEnv.DeviceNum; - auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; - void *StatePtr; - uint32_t StatePtrSize; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); - if (Err != HSA_STATUS_SUCCESS) { - DP("failed to find %s in loaded image\n", sym()); - return Err; - } - - if (StatePtrSize != SI.Size) { - DP("Symbol had size %u before loading, %u after\n", StatePtrSize, - SI.Size); - return HSA_STATUS_ERROR; - } - - return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, - StatePtrSize, DeviceId); - } - } - return HSA_STATUS_SUCCESS; - } -}; - -hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { - uint64_t Rounded = 4 * ((Size + 3) / 4); - void *Ptr; - hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); - hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); - if (Err != HSA_STATUS_SUCCESS) { - return Err; - } - - hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); - if (Rc != HSA_STATUS_SUCCESS) { - DP("zero fill device_state failed with %u\n", Rc); - core::Runtime::Memfree(Ptr); - return HSA_STATUS_ERROR; - } - - *RetPtr = Ptr; - return HSA_STATUS_SUCCESS; -} - -bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { - SymbolInfo SI; - int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); - return (Rc == 0) && (SI.Addr != nullptr); -} - -hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent, - void **LockedHostPtr) { - hsa_status_t err = is_locked(HostPtr, LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - // HostPtr is already locked, just return it - if (*LockedHostPtr) - return HSA_STATUS_SUCCESS; - - hsa_agent_t Agents[1] = {Agent}; - return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1, - LockedHostPtr); -} - -hsa_status_t unlock_memory(void *HostPtr) { - void 
*LockedHostPtr = nullptr; - hsa_status_t err = is_locked(HostPtr, &LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) - return err; - - // if LockedHostPtr is nullptr, then HostPtr was not locked - if (!LockedHostPtr) - return HSA_STATUS_SUCCESS; - - err = hsa_amd_memory_unlock(HostPtr); - return err; -} - -} // namespace - -namespace core { -hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { - return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), - &DeviceInfo().HSAAgents[0], NULL, Ptr); -} -} // namespace core - -static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { - hsa_status_t err; - uint32_t name_len; - err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); - if (err != HSA_STATUS_SUCCESS) { - DP("Error getting ISA info length\n"); - return err; - } - - char TargetID[name_len]; - err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); - if (err != HSA_STATUS_SUCCESS) { - DP("Error getting ISA info name\n"); - return err; - } - - auto TripleTargetID = llvm::StringRef(TargetID); - if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { - DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); - } - return HSA_STATUS_SUCCESS; -} - -extern "C" { -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { - return elfMachineIdIsAmdgcn(Image); -} - -int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, - __tgt_image_info *info) { - if (!__tgt_rtl_is_valid_binary(image)) - return false; - - // A subarchitecture was not specified. Assume it is compatible. - if (!info->Arch) - return true; - - int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); - - for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { - __tgt_rtl_init_device(DeviceId); - hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; - hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); - if (err != HSA_STATUS_SUCCESS) { - DP("Error iterating ISAs\n"); - return false; - } - if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId])) - return false; - } - DP("Image has Target ID compatible with the current environment: %s\n", - info->Arch); - return true; -} - -int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; } - -int __tgt_rtl_number_of_devices() { - // If the construction failed, no methods are safe to call - if (DeviceInfo().ConstructionSucceeded) { - return DeviceInfo().NumberOfDevices; - } - DP("AMDGPU plugin construction failed. 
Zero devices available\n"); - return 0; -} - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceInfo().RequiresFlags = RequiresFlags; - return RequiresFlags; -} - -int32_t __tgt_rtl_init_device(int DeviceId) { - hsa_status_t Err = hsa_init(); - if (Err != HSA_STATUS_SUCCESS) { - DP("HSA Initialization Failed.\n"); - return HSA_STATUS_ERROR; - } - // this is per device id init - DP("Initialize the device id: %d\n", DeviceId); - - hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; - - // Get number of Compute Unit - uint32_t ComputeUnits = 0; - Err = hsa_agent_get_info( - Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, - &ComputeUnits); - if (Err != HSA_STATUS_SUCCESS) { - DeviceInfo().ComputeUnits[DeviceId] = 1; - DP("Error getting compute units : settiing to 1\n"); - } else { - DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits; - DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]); - } - - char GetInfoName[64]; // 64 max size returned by get info - Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, - (void *)GetInfoName); - if (Err) - DeviceInfo().GPUName[DeviceId] = "--unknown gpu--"; - else { - DeviceInfo().GPUName[DeviceId] = GetInfoName; - } - - if (print_kernel_trace & STARTUP_DETAILS) - DP("Device#%-2d CU's: %2d %s\n", DeviceId, - DeviceInfo().ComputeUnits[DeviceId], - DeviceInfo().GPUName[DeviceId].c_str()); - - // Query attributes to determine number of threads/block and blocks/grid. - uint16_t WorkgroupMaxDim[3]; - Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, - &WorkgroupMaxDim); - if (Err != HSA_STATUS_SUCCESS) { - DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; - DP("Error getting grid dims: num groups : %d\n", - RTLDeviceInfoTy::DefaultNumTeams); - } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { - DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; - DP("Using %d ROCm blocks per grid\n", - DeviceInfo().GroupsPerDevice[DeviceId]); - } else { - DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; - DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit); - } - - // Get thread limit - hsa_dim3_t GridMaxDim; - Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); - if (Err == HSA_STATUS_SUCCESS) { - DeviceInfo().ThreadsPerGroup[DeviceId] = - reinterpret_cast(&GridMaxDim)[0] / - DeviceInfo().GroupsPerDevice[DeviceId]; - - if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) { - DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; - DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); - } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId], - RTLDeviceInfoTy::MaxWgSize)) { - DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize); - } else { - DP("Using ROCm Queried thread limit: %d\n", - DeviceInfo().ThreadsPerGroup[DeviceId]); - } - } else { - DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; - DP("Error getting max block dimension, use default:%d \n", - RTLDeviceInfoTy::MaxWgSize); - } - - // Get wavefront size - uint32_t WavefrontSize = 0; - Err = - hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); - if (Err == HSA_STATUS_SUCCESS) { - DP("Queried wavefront size: %d\n", WavefrontSize); - DeviceInfo().WarpSize[DeviceId] = WavefrontSize; - } else { - // TODO: 
Burn the wavefront size into the code object - DP("Warning: Unknown wavefront size, assuming 64\n"); - DeviceInfo().WarpSize[DeviceId] = 64; - } - - // Adjust teams to the env variables - - if (DeviceInfo().Env.TeamLimit > 0 && - (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().Env.TeamLimit))) { - DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n", - DeviceInfo().Env.TeamLimit); - } - - // Set default number of teams - if (DeviceInfo().Env.NumTeams > 0) { - DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; - DP("Default number of teams set according to environment %d\n", - DeviceInfo().Env.NumTeams); - } else { - char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); - int TeamsPerCU = DefaultTeamsPerCU; - if (TeamsPerCUEnvStr) { - TeamsPerCU = std::stoi(TeamsPerCUEnvStr); - } - - DeviceInfo().NumTeams[DeviceId] = - TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; - DP("Default number of teams = %d * number of compute units %d\n", - TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]); - } - - if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId])) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceInfo().GroupsPerDevice[DeviceId]); - } - - // Adjust threads to the env variables - if (DeviceInfo().Env.TeamThreadLimit > 0 && - (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], - DeviceInfo().Env.TeamThreadLimit))) { - DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n", - DeviceInfo().Env.TeamThreadLimit); - } - - // Set default number of threads - DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; - DP("Default number of threads set according to library's default %d\n", - RTLDeviceInfoTy::DefaultWgSize); - if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], - DeviceInfo().ThreadsPerGroup[DeviceId])) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceInfo().ThreadsPerGroup[DeviceId]); - } - - DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n", - DeviceId, DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().ThreadsPerGroup[DeviceId]); - - DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId, - DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId], - DeviceInfo().GroupsPerDevice[DeviceId] * - DeviceInfo().ThreadsPerGroup[DeviceId]); - - return OFFLOAD_SUCCESS; -} - -static __tgt_target_table * -__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - DeviceInfo().LoadRunLock.lock(); - __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); - DeviceInfo().LoadRunLock.unlock(); - return Res; -} - -__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, - __tgt_device_image *Image) { - // This function loads the device image onto gpu[DeviceId] and does other - // per-image initialization work. Specifically: - // - // - Initialize an DeviceEnvironmentTy instance embedded in the - // image at the symbol "__omp_rtl_device_environment" - // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. 
- // - // - Allocate a large array per-gpu (could be moved to init_device) - // - Read a uint64_t at symbol omptarget_nvptx_device_State_size - // - Allocate at least that many bytes of gpu memory - // - Zero initialize it - // - Write the pointer to the symbol omptarget_nvptx_device_State - // - // - Pulls some per-kernel information together from various sources and - // records it in the KernelsList for quicker access later - // - // The initialization can be done before or after loading the image onto the - // gpu. This function presently does a mixture. Using the hsa api to get/set - // the information is simpler to implement, in exchange for more complicated - // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes - // back from the gpu vs a hashtable lookup on the host. - - const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; - - DeviceInfo().clearOffloadEntriesTable(DeviceId); - - // We do not need to set the ELF version because the caller of this function - // had to do that to decide the right runtime to use - - if (!elfMachineIdIsAmdgcn(Image)) - return NULL; - - { - auto Env = - DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, - DeviceInfo().Env.DynamicMemSize, Image, ImgSize); - - auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; - auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = moduleRegisterFromMemoryToPlace( - KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, - [&](void *Data, size_t Size) { - if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { - __atomic_store_n(&DeviceInfo().HostcallRequired, true, - __ATOMIC_RELEASE); - } - return Env.beforeLoading(Data, Size); - }, - DeviceInfo().HSAExecutables); - - check("Module registering", Err); - if (Err != HSA_STATUS_SUCCESS) { - const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); - const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); - - if (strcmp(DeviceName, ElfName) != 0) { - DP("Possible gpu arch mismatch: device:%s, image:%s please check" - " compiler flag: -march=\n", - DeviceName, ElfName); - } else { - DP("Error loading image onto GPU: %s\n", get_error_string(Err)); - } - - return NULL; - } - - Err = Env.afterLoading(); - if (Err != HSA_STATUS_SUCCESS) { - return NULL; - } - } - - DP("AMDGPU module successfully loaded!\n"); - - { - // the device_State array is either large value in bss or a void* that - // needs to be assigned to a pointer to an array of size device_state_bytes - // If absent, it has been deadstripped and needs no setup. - - void *StatePtr; - uint32_t StatePtrSize; - auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, - &StatePtrSize); - - if (Err != HSA_STATUS_SUCCESS) { - DP("No device_state symbol found, skipping initialization\n"); - } else { - if (StatePtrSize < sizeof(void *)) { - DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize, - sizeof(void *)); - return NULL; - } - - // if it's larger than a void*, assume it's a bss array and no further - // initialization is required. 
Only try to set up a pointer for - // sizeof(void*) - if (StatePtrSize == sizeof(void *)) { - uint64_t DeviceStateBytes = - getDeviceStateBytes((char *)Image->ImageStart, ImgSize); - if (DeviceStateBytes == 0) { - DP("Can't initialize device_State, missing size information\n"); - return NULL; - } - - auto &DSS = DeviceInfo().DeviceStateStore[DeviceId]; - if (DSS.first.get() == nullptr) { - assert(DSS.second == 0); - void *Ptr = NULL; - hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("Failed to allocate device_state array\n"); - return NULL; - } - DSS = { - std::unique_ptr{Ptr}, - DeviceStateBytes, - }; - } - - void *Ptr = DSS.first.get(); - if (DeviceStateBytes != DSS.second) { - DP("Inconsistent sizes of device_State unsupported\n"); - return NULL; - } - - // write ptr to device memory so it can be used by later kernels - Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, - sizeof(void *), DeviceId); - if (Err != HSA_STATUS_SUCCESS) { - DP("memcpy install of state_ptr failed\n"); - return NULL; - } - } - } - } - - // Here, we take advantage of the data that is appended after img_end to get - // the symbols' name we need to load. This data consist of the host entries - // begin and end as well as the target name (see the offloading linker script - // creation in clang compiler). - - // Find the symbols in the module by name. The name can be obtain by - // concatenating the host entry name with the target name - - __tgt_offload_entry *HostBegin = Image->EntriesBegin; - __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - - if (!E->addr) { - // The host should have always something in the address to - // uniquely identify the target region. - DP("Analyzing host entry '' (size = %lld)...\n", - (unsigned long long)E->size); - return NULL; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - - void *Varptr; - uint32_t Varsize; - - auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; - hsa_status_t Err = interop_hsa_get_symbol_info( - SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); - - if (Err != HSA_STATUS_SUCCESS) { - // Inform the user what symbol prevented offloading - DP("Loading global '%s' (Failed)\n", E->name); - return NULL; - } - - if (Varsize != E->size) { - DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name, - Varsize, E->size); - return NULL; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)); - Entry.addr = (void *)Varptr; - - DeviceInfo().addOffloadEntry(DeviceId, Entry); - - if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - E->flags & OMP_DECLARE_TARGET_LINK) { - // If unified memory is present any target link variables - // can access host addresses directly. There is no longer a - // need for device copies. 
- Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, - sizeof(void *), DeviceId); - if (Err != HSA_STATUS_SUCCESS) - DP("Error when copying USM\n"); - DP("Copy linked variable host address (" DPxMOD ")" - "to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)); - } - - continue; - } - - DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)); - - // errors in kernarg_segment_size previously treated as = 0 (or as undef) - uint32_t KernargSegmentSize = 0; - auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; - hsa_status_t Err = HSA_STATUS_SUCCESS; - if (!E->name) { - Err = HSA_STATUS_ERROR; - } else { - std::string KernelStr = std::string(E->name); - auto It = KernelInfoMap.find(KernelStr); - if (It != KernelInfoMap.end()) { - atl_kernel_info_t Info = It->second; - KernargSegmentSize = Info.kernel_segment_size; - } else { - Err = HSA_STATUS_ERROR; - } - } - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal = - llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; - - // get flat group size if present, else Default_WG_Size - int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; - - // get Kernel Descriptor if present. - // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp - struct KernDescValType { - uint16_t Version; - uint16_t TSize; - uint16_t WGSize; - }; - struct KernDescValType KernDescVal; - std::string KernDescNameStr(E->name); - KernDescNameStr += "_kern_desc"; - const char *KernDescName = KernDescNameStr.c_str(); - - const void *KernDescPtr; - uint32_t KernDescSize; - void *CallStackAddr = nullptr; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, - &KernDescPtr, &KernDescSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)KernDescSize != sizeof(KernDescVal)) - DP("Loading global computation properties '%s' - size mismatch (%u != " - "%lu)\n", - KernDescName, KernDescSize, sizeof(KernDescVal)); - - memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); - - // Check structure size against recorded size. 
- if ((size_t)KernDescSize != KernDescVal.TSize) - DP("KernDescVal size %lu does not match advertized size %d for '%s'\n", - sizeof(KernDescVal), KernDescVal.TSize, KernDescName); - - DP("After loading global for %s KernDesc \n", KernDescName); - DP("KernDesc: Version: %d\n", KernDescVal.Version); - DP("KernDesc: TSize: %d\n", KernDescVal.TSize); - DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize); - - if (KernDescVal.WGSize == 0) { - KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize); - } - WGSizeVal = KernDescVal.WGSize; - DP("WGSizeVal %d\n", WGSizeVal); - check("Loading KernDesc computation property", Err); - } else { - DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName); - - // Flat group size - std::string WGSizeNameStr(E->name); - WGSizeNameStr += "_wg_size"; - const char *WGSizeName = WGSizeNameStr.c_str(); - - const void *WGSizePtr; - uint32_t WGSize; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, - &WGSizePtr, &WGSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)WGSize != sizeof(int16_t)) { - DP("Loading global computation properties '%s' - size mismatch (%u " - "!= " - "%lu)\n", - WGSizeName, WGSize, sizeof(int16_t)); - return NULL; - } - - memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); - - DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal); - - if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || - WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { - DP("Error wrong WGSize value specified in HSA code object file: " - "%d\n", - WGSizeVal); - WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; - } - } else { - DP("Warning: Loading WGSize '%s' - symbol not found, " - "using default value %d\n", - WGSizeName, WGSizeVal); - } - - check("Loading WGSize computation property", Err); - } - - // Read execution mode from global in binary - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - const void *ExecModePtr; - uint32_t VarSize; - Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, - &ExecModePtr, &VarSize); - - if (Err == HSA_STATUS_SUCCESS) { - if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global computation properties '%s' - size mismatch(%u != " - "%lu)\n", - ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return NULL; - } - - memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); - - DP("After loading global for %s ExecMode = %d\n", ExecModeName, - ExecModeVal); - - if (ExecModeVal < 0 || - ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { - DP("Error wrong exec_mode value specified in HSA code object file: " - "%d\n", - ExecModeVal); - return NULL; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value " - "GENERIC (1)\n", - ExecModeName); - } - check("Loading computation property", Err); - - KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, - CallStackAddr, E->name, KernargSegmentSize, - DeviceInfo().KernArgPool)); - __tgt_offload_entry Entry = *E; - Entry.addr = (void *)&KernelsList.back(); - DeviceInfo().addOffloadEntry(DeviceId, Entry); - DP("Entry point %ld maps to %s\n", E - HostBegin, E->name); - } - - return DeviceInfo().getOffloadEntriesTable(DeviceId); -} - -void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { - void *Ptr = NULL; - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too 
large"); - - hsa_amd_memory_pool_t MemoryPool; - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - // GPU memory - MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); - break; - case TARGET_ALLOC_HOST: - // non-migratable memory accessible by host and device(s) - MemoryPool = DeviceInfo().getHostMemoryPool(); - break; - default: - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - return NULL; - } - - hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); - DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)Ptr); - Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL; - return Ptr; -} - -int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size, __tgt_async_info *AsyncInfo) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - if (AsyncInfo) { - initAsyncInfo(AsyncInfo); - return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); - } - return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); -} - -int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - __tgt_async_info AsyncInfo; - int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); - if (Rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); -} - -int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, - int64_t Size, - __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - initAsyncInfo(AsyncInfo); - return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); -} - -int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - // HSA can free pointers allocated from different types of memory pool. 
- hsa_status_t Err; - DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr); - Err = core::Runtime::Memfree(TgtPtr); - if (Err != HSA_STATUS_SUCCESS) { - DP("Error when freeing HSA memory\n"); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - KernelArgsTy *KernelArgs, - __tgt_async_info *AsyncInfo) { - assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && - !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && - "Only one dimensional kernels supported."); - assert(AsyncInfo && "AsyncInfo is nullptr"); - initAsyncInfo(AsyncInfo); - - DeviceInfo().LoadRunLock.lock_shared(); - int32_t Res = - runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, - KernelArgs->NumArgs, KernelArgs->NumTeams[0], - KernelArgs->ThreadLimit[0], KernelArgs->Tripcount); - - DeviceInfo().LoadRunLock.unlock_shared(); - return Res; -} - -int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this - // invariant is not ensured by devices.cpp for amdgcn. - // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); - if (AsyncInfo->Queue) { - finiAsyncInfo(AsyncInfo); - } - return OFFLOAD_SUCCESS; -} - -void __tgt_rtl_print_device_info(int32_t DeviceId) { - // TODO: Add an assertion that DeviceId is in range. - // NOTE: We don't need to set a context to print device info. - - DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); -} - -int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, - void **LockedHostPtr) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - - hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; - hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error in tgt_rtl_data_lock\n"); - return OFFLOAD_FAIL; - } - DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size, - (long long unsigned)(Elf64_Addr)*LockedHostPtr); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { - assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large"); - hsa_status_t err = unlock_memory(HostPtr); - if (err != HSA_STATUS_SUCCESS) { - DP("Error in tgt_rtl_data_unlock\n"); - return OFFLOAD_FAIL; - } - - DP("Tgt unlock data (tgt:%016llx).\n", - (long long unsigned)(Elf64_Addr)HostPtr); - return OFFLOAD_SUCCESS; -} - -} // extern "C" diff --git a/openmp/libomptarget/plugins/common/CMakeLists.txt b/openmp/libomptarget/plugins/common/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/common/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Common parts which can be used by all plugins -# -##===----------------------------------------------------------------------===## - -add_subdirectory(elf_common) -add_subdirectory(MemoryManager) diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a CUDA machine if available. -# -##===----------------------------------------------------------------------===## -set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL - "Whether to build CUDA plugin") -if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN) - libomptarget_say("Not building CUDA offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false") - return() -endif() - -if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") - return() -endif() - -libomptarget_say("Building CUDA offloading plugin.") - -set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) -option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) - -add_llvm_library(omptarget.rtl.cuda SHARED - src/rtl.cpp - - LINK_COMPONENTS - Support - Object - - LINK_LIBS PRIVATE - elf_common - MemoryManager - ${OPENMP_PTHREAD_LIB} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports,-z,defs" - - NO_INSTALL_RPATH -) - -if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) - libomptarget_say("Building CUDA plugin linked against libcuda") - target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver) -else() - libomptarget_say("Building CUDA plugin for dlopened libcuda") - target_include_directories(omptarget.rtl.cuda PRIVATE dynamic_cuda) - target_sources(omptarget.rtl.cuda PRIVATE dynamic_cuda/cuda.cpp) -endif() - -# Define the suffix for the runtime messaging dumps. -target_compile_definitions(omptarget.rtl.cuda PRIVATE TARGET_NAME="CUDA") -target_include_directories(omptarget.rtl.cuda PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") -set_target_properties(omptarget.rtl.cuda PROPERTIES - INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." - CXX_VISIBILITY_PRESET protected) - -# Report to the parent scope that we are building a plugin for CUDA. 
-# This controls whether tests are run for the nvptx offloading target -# Run them if libcuda is available, or if the user explicitly asked for dlopen -# Otherwise this plugin is being built speculatively and there may be no cuda available -option(LIBOMPTARGET_FORCE_NVIDIA_TESTS "Build NVIDIA libomptarget tests" OFF) -if (LIBOMPTARGET_FOUND_NVIDIA_GPU OR LIBOMPTARGET_FORCE_NVIDIA_TESTS) - libomptarget_say("Enable tests using CUDA plugin") - set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-LTO" PARENT_SCOPE) - list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda") - set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) -else() - libomptarget_say("Not generating NVIDIA tests, no supported devices detected. Use 'LIBOMPTARGET_FORCE_NVIDIA_TESTS' to override.") -endif() diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp deleted file mode 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ /dev/null @@ -1,1925 +0,0 @@ -//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for CUDA machine -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/StringRef.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Debug.h" -#include "DeviceEnvironment.h" -#include "omptarget.h" -#include "omptargetplugin.h" - -#ifndef TARGET_NAME -#define TARGET_NAME CUDA -#endif -#ifndef DEBUG_PREFIX -#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL" -#endif - -#include "MemoryManager.h" - -#include "llvm/Frontend/OpenMP/OMPConstants.h" - -using namespace llvm; - -// Utility for retrieving and printing CUDA error string. -#ifdef OMPTARGET_DEBUG -#define CUDA_ERR_STRING(err) \ - do { \ - if (getDebugLevel() > 0) { \ - const char *errStr = nullptr; \ - CUresult errStr_status = cuGetErrorString(err, &errStr); \ - if (errStr_status == CUDA_ERROR_INVALID_VALUE) \ - REPORT("Unrecognized CUDA error code: %d\n", err); \ - else if (errStr_status == CUDA_SUCCESS) \ - REPORT("CUDA error is: %s\n", errStr); \ - else { \ - REPORT("Unresolved CUDA error code: %d\n", err); \ - REPORT("Unsuccessful cuGetErrorString return status: %d\n", \ - errStr_status); \ - } \ - } else { \ - const char *errStr = nullptr; \ - CUresult errStr_status = cuGetErrorString(err, &errStr); \ - if (errStr_status == CUDA_SUCCESS) \ - REPORT("%s \n", errStr); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define CUDA_ERR_STRING(err) \ - do { \ - const char *errStr = nullptr; \ - CUresult errStr_status = cuGetErrorString(err, &errStr); \ - if (errStr_status == CUDA_SUCCESS) \ - REPORT("%s \n", errStr); \ - } while (false) -#endif // OMPTARGET_DEBUG - -#define BOOL2TEXT(b) ((b) ? "Yes" : "No") - -#include "elf_common.h" - -/// Keep entries table per device. -struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -/// Use a single entity to encode a kernel and a set of flags. 
-struct KernelTy {
-  CUfunction Func;
-
-  // execution mode of kernel
-  llvm::omp::OMPTgtExecModeFlags ExecutionMode;
-
-  /// Maximal number of threads per block for this kernel.
-  int MaxThreadsPerBlock = 0;
-
-  KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
-      : Func(Func), ExecutionMode(ExecutionMode) {}
-};
-
-namespace {
-bool checkResult(CUresult Err, const char *ErrMsg) {
-  if (Err == CUDA_SUCCESS)
-    return true;
-
-  REPORT("%s", ErrMsg);
-  CUDA_ERR_STRING(Err);
-  return false;
-}
-
-int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
-               CUstream Stream) {
-  CUresult Err =
-      cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
-
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when copying data from device to device. Pointers: src "
-       "= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
-       DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
-  CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
-  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-  CUresult Err = cuEventRecord(Event, Stream);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
-       DPxPTR(Stream), DPxPTR(Event));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-int syncEvent(void *EventPtr) {
-  CUevent Event = reinterpret_cast<CUevent>(EventPtr);
-
-  CUresult Err = cuEventSynchronize(Event);
-  if (Err != CUDA_SUCCESS) {
-    DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
-    CUDA_ERR_STRING(Err);
-    return OFFLOAD_FAIL;
-  }
-
-  return OFFLOAD_SUCCESS;
-}
-
-namespace {
-
-// Structure contains per-device data
-struct DeviceDataTy {
-  /// List that contains all the kernels.
-  std::list<KernelTy> KernelsList;
-
-  std::list<FuncOrGblEntryTy> FuncGblEntries;
-
-  CUcontext Context = nullptr;
-  // Device properties
-  unsigned int ThreadsPerBlock = 0;
-  unsigned int BlocksPerGrid = 0;
-  unsigned int WarpSize = 0;
-  // OpenMP properties
-  unsigned int NumTeams = 0;
-  unsigned int NumThreads = 0;
-};
-
-/// Resource allocator where \p T is the resource type.
-/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
-/// accordingly. The implementation should not raise any exception.
-template <typename T> struct AllocatorTy {
-  using ElementTy = T;
-  virtual ~AllocatorTy() {}
-
-  /// Create a resource and assign to R.
-  virtual int create(T &R) noexcept = 0;
-  /// Destroy the resource.
-  virtual int destroy(T) noexcept = 0;
-};
-
-/// Allocator for CUstream.
-struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
-  /// See AllocatorTy::create.
-  int create(CUstream &Stream) noexcept override {
-    if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
-                     "Error returned from cuStreamCreate\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// See AllocatorTy::destroy.
-  int destroy(CUstream Stream) noexcept override {
-    if (!checkResult(cuStreamDestroy(Stream),
-                     "Error returned from cuStreamDestroy\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-/// Allocator for CUevent.
-struct EventAllocatorTy final : public AllocatorTy<CUevent> {
-  /// See AllocatorTy::create.
-  int create(CUevent &Event) noexcept override {
-    if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
-                     "Error returned from cuEventCreate\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// See AllocatorTy::destroy.
-  int destroy(CUevent Event) noexcept override {
-    if (!checkResult(cuEventDestroy(Event),
-                     "Error returned from cuEventDestroy\n"))
-      return OFFLOAD_FAIL;
-
-    return OFFLOAD_SUCCESS;
-  }
-};
-
-/// A generic pool of resources where \p T is the resource type.
-/// \p T should be copyable as the object is stored in \p std::vector .
-template <typename AllocTy> class ResourcePoolTy {
-  using ElementTy = typename AllocTy::ElementTy;
-  /// Index of the next available resource.
-  size_t Next = 0;
-  /// Mutex to guard the pool.
-  std::mutex Mutex;
-  /// Pool of resources. The difference between \p Resources and \p Pool is,
-  /// when a resource is acquired and released, it is all on \p Resources. When
-  /// a batch of new resources are needed, they are both added to \p Resources
-  /// and \p Pool. The reason for this setting is, \p Resources could contain
-  /// redundant elements because resources are not released, which can cause
-  /// double free. This setting makes sure that \p Pool always has every
-  /// resource allocated from the device.
-  std::vector<ElementTy> Resources;
-  std::vector<ElementTy> Pool;
-  /// A reference to the corresponding allocator.
-  AllocTy Allocator;
-
-  /// If `Resources` is used up, we will fill in more resources. It assumes
-  /// that the new size `Size` should always be larger than the current size.
-  bool resize(size_t Size) {
-    assert(Resources.size() == Pool.size() && "size mismatch");
-    auto CurSize = Resources.size();
-    assert(Size > CurSize && "Unexpected smaller size");
-    Pool.reserve(Size);
-    Resources.reserve(Size);
-    for (auto I = CurSize; I < Size; ++I) {
-      ElementTy NewItem;
-      int Ret = Allocator.create(NewItem);
-      if (Ret != OFFLOAD_SUCCESS)
-        return false;
-      Pool.push_back(NewItem);
-      Resources.push_back(NewItem);
-    }
-    return true;
-  }
-
-public:
-  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
-      : Allocator(std::move(A)) {
-    if (Size)
-      (void)resize(Size);
-  }
-
-  ~ResourcePoolTy() noexcept { clear(); }
-
-  /// Get a resource from pool. `Next` always points to the next available
-  /// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
-  /// still available. If there is no resource left, we will ask for more. Each
-  /// time a resource is assigned, the id will increase one.
-  /// xxxxxs+++++++++
-  ///      ^
-  ///      Next
-  /// After assignment, the pool becomes the following and s is assigned.
-  /// xxxxxs+++++++++
-  ///       ^
-  ///       Next
-  int acquire(ElementTy &R) noexcept {
-    std::lock_guard<std::mutex> LG(Mutex);
-    if (Next == Resources.size()) {
-      auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
-      if (!resize(NewSize))
-        return OFFLOAD_FAIL;
-    }
-
-    assert(Next < Resources.size());
-
-    R = Resources[Next++];
-
-    return OFFLOAD_SUCCESS;
-  }
-
-  /// Return the resource back to the pool. When we return a resource, we need
-  /// to first decrease `Next`, and then copy the resource back. It is worth
-  /// noting that the order in which resources are returned might differ from
-  /// the order in which they were assigned; at some point, there might be two
-  /// identical resources.
-  /// xxax+a+++++
-  ///     ^
-  ///     Next
-  /// However, it doesn't matter, because they're always on the two sides of
-  /// `Next`. The left one will in the end be overwritten by another resource.
-  /// Therefore, after several executions, the order of the pool might differ
-  /// from its initial state.
-  void release(ElementTy R) noexcept {
-    std::lock_guard<std::mutex> LG(Mutex);
-    Resources[--Next] = R;
-  }
-
-  /// Release all stored resources and clear the pool.
-  /// Note: This function is not thread safe. Be sure to guard it if
-  /// necessary.
-  void clear() noexcept {
-    for (auto &R : Pool)
-      (void)Allocator.destroy(R);
-    Pool.clear();
-    Resources.clear();
-  }
-};
-
-} // namespace
-
-class DeviceRTLTy {
-  int NumberOfDevices;
-  // OpenMP environment properties
-  int EnvNumTeams;
-  int EnvTeamLimit;
-  int EnvTeamThreadLimit;
-  // OpenMP requires flags
-  int64_t RequiresFlags;
-  // Amount of dynamic shared memory to use at launch.
-  uint64_t DynamicMemorySize;
-
-  /// Number of initial streams for each device.
-  int NumInitialStreams = 32;
-
-  /// Number of initial events for each device.
-  int NumInitialEvents = 8;
-
-  static constexpr const int32_t HardThreadLimit = 1024;
-  static constexpr const int32_t DefaultNumTeams = 128;
-  static constexpr const int32_t DefaultNumThreads = 128;
-
-  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
-  std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
-
-  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
-  std::vector<std::unique_ptr<EventPoolTy>> EventPool;
-
-  std::vector<DeviceDataTy> DeviceData;
-  std::vector<std::vector<CUmodule>> Modules;
-
-  /// Vector of flags indicating the initialization status of all associated
-  /// devices.
-  std::vector<bool> InitializedFlags;
-
-  enum class PeerAccessState : uint8_t { Unkown, Yes, No };
-  std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
-  std::mutex PeerAccessMatrixLock;
-
-  /// A class responsible for interacting with device native runtime library to
-  /// allocate and free memory.
-  class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
-  public:
-    void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
-      if (Size == 0)
-        return nullptr;
-
-      void *MemAlloc = nullptr;
-      CUresult Err;
-      switch (Kind) {
-      case TARGET_ALLOC_DEFAULT:
-      case TARGET_ALLOC_DEVICE:
-        CUdeviceptr DevicePtr;
-        Err = cuMemAlloc(&DevicePtr, Size);
-        MemAlloc = (void *)DevicePtr;
-        if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
-          return nullptr;
-        break;
-      case TARGET_ALLOC_HOST:
-        void *HostPtr;
-        Err = cuMemAllocHost(&HostPtr, Size);
-        MemAlloc = HostPtr;
-        if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
-          return nullptr;
-        break;
-      case TARGET_ALLOC_SHARED:
-        CUdeviceptr SharedPtr;
-        Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
-        MemAlloc = (void *)SharedPtr;
-        if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
-          return nullptr;
-        break;
-      }
-
-      return MemAlloc;
-    }
-
-    int free(void *TgtPtr, TargetAllocTy Kind) override {
-      CUresult Err;
-      // Host pinned memory must be freed differently.
-      switch (Kind) {
-      case TARGET_ALLOC_DEFAULT:
-      case TARGET_ALLOC_DEVICE:
-      case TARGET_ALLOC_SHARED:
-        Err = cuMemFree((CUdeviceptr)TgtPtr);
-        if (!checkResult(Err, "Error returned from cuMemFree\n"))
-          return OFFLOAD_FAIL;
-        break;
-      case TARGET_ALLOC_HOST:
-        Err = cuMemFreeHost(TgtPtr);
-        if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
-          return OFFLOAD_FAIL;
-        break;
-      }
-
-      return OFFLOAD_SUCCESS;
-    }
-  };
-
-  /// A vector of device allocators
-  std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
-
-  /// A vector of memory managers. Since the memory manager is non-copyable and
-  // non-removable, we wrap them into std::unique_ptr.
- std::vector> MemoryManagers; - - /// Whether use memory manager - bool UseMemoryManager = true; - - // Record entry point associated with device - void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.push_back(Entry); - } - - // Return a pointer to the entry associated with the pointer - const __tgt_offload_entry *getOffloadEntry(const int DeviceId, - const void *Addr) const { - for (const __tgt_offload_entry &Itr : - DeviceData[DeviceId].FuncGblEntries.back().Entries) - if (Itr.addr == Addr) - return &Itr; - - return nullptr; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - - if (E.Entries.empty()) - return nullptr; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = E.Entries.data(); - E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(const int DeviceId) { - DeviceData[DeviceId].FuncGblEntries.emplace_back(); - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; - } - -public: - CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - if (!AsyncInfo->Queue) { - CUstream S; - if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS) - return nullptr; - - AsyncInfo->Queue = S; - } - - return reinterpret_cast(AsyncInfo->Queue); - } - - // This class should not be copied - DeviceRTLTy(const DeviceRTLTy &) = delete; - DeviceRTLTy(DeviceRTLTy &&) = delete; - - DeviceRTLTy() - : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), - EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED), - DynamicMemorySize(0) { - - DP("Start initializing CUDA\n"); - - CUresult Err = cuInit(0); - if (Err == CUDA_ERROR_INVALID_HANDLE) { - // Can't call cuGetErrorString if dlsym failed - DP("Failed to load CUDA shared library\n"); - return; - } - if (Err == CUDA_ERROR_NO_DEVICE) { - DP("There are no devices supporting CUDA.\n"); - return; - } - if (!checkResult(Err, "Error returned from cuInit\n")) { - return; - } - - Err = cuDeviceGetCount(&NumberOfDevices); - if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) - return; - - if (NumberOfDevices == 0) { - DP("There are no devices supporting CUDA.\n"); - return; - } - - DeviceData.resize(NumberOfDevices); - Modules.resize(NumberOfDevices); - StreamPool.resize(NumberOfDevices); - EventPool.resize(NumberOfDevices); - PeerAccessMatrix.resize(NumberOfDevices); - for (auto &V : PeerAccessMatrix) - V.resize(NumberOfDevices, PeerAccessState::Unkown); - - // Get environment variables regarding teams - if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) { - // OMP_TEAM_LIMIT has been set - EnvTeamLimit = std::stoi(EnvStr); - DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); - } - if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) { - // OMP_TEAMS_THREAD_LIMIT has been set - EnvTeamThreadLimit = std::stoi(EnvStr); - DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit); - } - if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { - // OMP_NUM_TEAMS has been set - EnvNumTeams = std::stoi(EnvStr); - DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); - } - if (const char *EnvStr = 
getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) { - // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set - DynamicMemorySize = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n", - DynamicMemorySize); - } - if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) { - // LIBOMPTARGET_NUM_INITIAL_STREAMS has been set - NumInitialStreams = std::stoi(EnvStr); - DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams); - } - - for (int I = 0; I < NumberOfDevices; ++I) - DeviceAllocators.emplace_back(); - - // Get the size threshold from environment variable - std::pair Res = MemoryManagerTy::getSizeThresholdFromEnv(); - UseMemoryManager = Res.second; - size_t MemoryManagerThreshold = Res.first; - - if (UseMemoryManager) - for (int I = 0; I < NumberOfDevices; ++I) - MemoryManagers.emplace_back(std::make_unique( - DeviceAllocators[I], MemoryManagerThreshold)); - - // We lazily initialize all devices later. - InitializedFlags.assign(NumberOfDevices, false); - } - - ~DeviceRTLTy() { - for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) - deinitDevice(DeviceId); - } - - // Check whether a given DeviceId is valid - bool isValidDeviceId(const int DeviceId) const { - return DeviceId >= 0 && DeviceId < NumberOfDevices; - } - - int getNumOfDevices() const { return NumberOfDevices; } - - void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } - - int initDevice(const int DeviceId) { - CUdevice Device; - - DP("Getting device %d\n", DeviceId); - CUresult Err = cuDeviceGet(&Device, DeviceId); - if (!checkResult(Err, "Error returned from cuDeviceGet\n")) - return OFFLOAD_FAIL; - - assert(InitializedFlags[DeviceId] == false && "Reinitializing device!"); - InitializedFlags[DeviceId] = true; - - // Query the current flags of the primary context and set its flags if - // it is inactive - unsigned int FormerPrimaryCtxFlags = 0; - int FormerPrimaryCtxIsActive = 0; - Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, - &FormerPrimaryCtxIsActive); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) - return OFFLOAD_FAIL; - - if (FormerPrimaryCtxIsActive) { - DP("The primary context is active, no change to its flags\n"); - if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != - CU_CTX_SCHED_BLOCKING_SYNC) - DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); - } else { - DP("The primary context is inactive, set its flags to " - "CU_CTX_SCHED_BLOCKING_SYNC\n"); - Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) - return OFFLOAD_FAIL; - } - - // Retain the per device primary context and save it to use whenever this - // device is selected. - Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device); - if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) - return OFFLOAD_FAIL; - - Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Initialize the stream pool. - if (!StreamPool[DeviceId]) - StreamPool[DeviceId] = std::make_unique(StreamAllocatorTy(), - NumInitialStreams); - - // Initialize the event pool. - if (!EventPool[DeviceId]) - EventPool[DeviceId] = - std::make_unique(EventAllocatorTy(), NumInitialEvents); - - // Query attributes to determine number of threads/block and blocks/grid. 
- int MaxGridDimX; - Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max grid dimension, use default value %d\n", - DeviceRTLTy::DefaultNumTeams); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; - } else { - DP("Using %d CUDA blocks per grid\n", MaxGridDimX); - DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; - } - - // We are only exploiting threads along the x axis. - int MaxBlockDimX; - Err = cuDeviceGetAttribute(&MaxBlockDimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max block dimension, use default value %d\n", - DeviceRTLTy::DefaultNumThreads); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; - } else { - DP("Using %d CUDA threads per block\n", MaxBlockDimX); - DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; - - if (EnvTeamThreadLimit > 0 && - DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) { - DP("Max CUDA threads per block %d exceeds the thread limit %d set by " - "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n", - DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit; - } - if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) { - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " - "capping at the hard limit\n", - DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; - } - } - - // Get and set warp size - int WarpSize; - Err = - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting warp size, assume default value 32\n"); - DeviceData[DeviceId].WarpSize = 32; - } else { - DP("Using warp size %d\n", WarpSize); - DeviceData[DeviceId].WarpSize = WarpSize; - } - - // Adjust teams to the env variables - if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - EnvTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; - } - - size_t StackLimit; - size_t HeapLimit; - if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) { - StackLimit = std::stol(EnvStr); - if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } else { - if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } - if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) { - HeapLimit = std::stol(EnvStr); - if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } else { - if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS) - return OFFLOAD_FAIL; - } - - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Device supports up to %d CUDA blocks and %d threads with a " - "warp size of %d\n", - DeviceData[DeviceId].BlocksPerGrid, - DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Device heap size is %d Bytes, device stack size is %d Bytes per " - "thread\n", - (int)HeapLimit, (int)StackLimit); - - // Set default number of teams - if (EnvNumTeams > 0) { - DP("Default number of teams set according to environment %d\n", - EnvNumTeams); - DeviceData[DeviceId].NumTeams = EnvNumTeams; - } else { - DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; - DP("Default number 
of teams set according to library's default %d\n", - DeviceRTLTy::DefaultNumTeams); - } - - if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceData[DeviceId].BlocksPerGrid); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; - } - - // Set default number of threads - DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; - DP("Default number of threads set according to library's default %d\n", - DeviceRTLTy::DefaultNumThreads); - if (DeviceData[DeviceId].NumThreads > - DeviceData[DeviceId].ThreadsPerBlock) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock; - } - - return OFFLOAD_SUCCESS; - } - - int deinitDevice(const int DeviceId) { - auto IsInitialized = InitializedFlags[DeviceId]; - if (!IsInitialized) - return OFFLOAD_SUCCESS; - InitializedFlags[DeviceId] = false; - - if (UseMemoryManager) - MemoryManagers[DeviceId].release(); - - StreamPool[DeviceId].reset(); - EventPool[DeviceId].reset(); - - DeviceDataTy &D = DeviceData[DeviceId]; - if (!checkResult(cuCtxSetCurrent(D.Context), - "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Unload all modules. - for (auto &M : Modules[DeviceId]) - if (!checkResult(cuModuleUnload(M), - "Error returned from cuModuleUnload\n")) - return OFFLOAD_FAIL; - - // Destroy context. - CUdevice Device; - if (!checkResult(cuCtxGetDevice(&Device), - "Error returned from cuCtxGetDevice\n")) - return OFFLOAD_FAIL; - - if (!checkResult(cuDevicePrimaryCtxRelease(Device), - "Error returned from cuDevicePrimaryCtxRelease\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } - - __tgt_target_table *loadBinary(const int DeviceId, - const __tgt_device_image *Image) { - // Clear the offload table as we are going to create a new one. - clearOffloadEntriesTable(DeviceId); - - // Create the module and extract the function pointers. - CUmodule Module; - DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); - CUresult Err = - cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); - if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) - return nullptr; - - DP("CUDA module successfully loaded!\n"); - - Modules[DeviceId].push_back(Module); - - // Find the symbols in the module by name. - const __tgt_offload_entry *HostBegin = Image->EntriesBegin; - const __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - std::list &KernelsList = DeviceData[DeviceId].KernelsList; - for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) { - // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. 
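
The device initialization above applies the same pattern several times: start from the requested or default value, then clamp it against the environment limit, the queried device attribute, and finally a hard limit. A compact sketch of that clamping chain (the helper is hypothetical; the 128 default and 1024 hard limit mirror the constants above):

```cpp
#include <algorithm>
#include <cstdio>

// Hypothetical helper mirroring the capping chain above.
int clampThreads(int Requested, int EnvLimit, int DeviceMax, int HardLimit) {
  int Threads = Requested > 0 ? Requested : 128; // default when unspecified
  if (EnvLimit > 0)                              // OMP_TEAMS_THREAD_LIMIT
    Threads = std::min(Threads, EnvLimit);
  Threads = std::min(Threads, DeviceMax);        // queried device attribute
  Threads = std::min(Threads, HardLimit);        // 1024 in the plugin above
  return Threads;
}

int main() {
  std::printf("%d\n", clampThreads(0, 256, 1024, 1024));   // 128
  std::printf("%d\n", clampThreads(2048, -1, 1024, 1024)); // 1024
}
```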
- DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - CUdeviceptr CUPtr; - size_t CUSize; - Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading global '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (CUSize != E->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, - CUSize, E->size); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); - - Entry.addr = (void *)(CUPtr); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD - ") to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - REPORT("Loading '%s' Failed\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - // default value GENERIC (in case symbol is missing from cubin file) - llvm::omp::OMPTgtExecModeFlags ExecModeVal; - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t CUSize; - Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags)); - return nullptr; - } - - Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from device to host. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value GENERIC (1)\n", - ExecModeName); - } - - KernelsList.emplace_back(Func, ExecModeVal); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } - - // send device environment data to the device - { - // TODO: The device ID used here is not the real device ID used by OpenMP. 
- DeviceEnvironmentTy DeviceEnv{0, static_cast(NumberOfDevices), - static_cast(DeviceId), - static_cast(DynamicMemorySize)}; - - if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DeviceEnv.DebugKind = std::stoi(EnvStr); - - const char *DeviceEnvName = "__omp_rtl_device_environment"; - CUdeviceptr DeviceEnvPtr; - size_t CUSize; - - Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(DeviceEnv)) { - REPORT( - "Global device_environment '%s' - size mismatch (%zu != %zu)\n", - DeviceEnvName, CUSize, sizeof(int32_t)); - CUDA_ERR_STRING(Err); - return nullptr; - } - - Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); - if (Err != CUDA_SUCCESS) { - REPORT("Error when copying data from host to device. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", - DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Sending global device environment data %zu bytes\n", CUSize); - } else { - DP("Finding global device environment '%s' - symbol missing.\n", - DeviceEnvName); - DP("Continue, considering this is a device RTL which does not accept " - "environment setting.\n"); - } - } - - return getOffloadEntriesTable(DeviceId); - } - - void *dataAlloc(const int DeviceId, const int64_t Size, - const TargetAllocTy Kind) { - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - if (UseMemoryManager) - return MemoryManagers[DeviceId]->allocate(Size, nullptr); - else - return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); - case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind); - } - - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - - return nullptr; - } - - int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, - const int64_t Size, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. Pointers: host " - "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, - const int64_t Size, __tgt_async_info *AsyncInfo) const { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. 
Pointers: host " - "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr, - int64_t Size, __tgt_async_info *AsyncInfo) { - assert(AsyncInfo && "AsyncInfo is nullptr"); - - CUresult Err; - CUstream Stream = getStream(SrcDevId, AsyncInfo); - - // If they are two devices, we try peer to peer copy first - if (SrcDevId != DstDevId) { - std::lock_guard LG(PeerAccessMatrixLock); - - switch (PeerAccessMatrix[SrcDevId][DstDevId]) { - case PeerAccessState::No: { - REPORT("Peer access from %" PRId32 " to %" PRId32 - " is not supported. Fall back to D2D memcpy.\n", - SrcDevId, DstDevId); - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - case PeerAccessState::Unkown: { - int CanAccessPeer = 0; - Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId); - if (Err != CUDA_SUCCESS) { - REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32 - ", dst = %" PRId32 ". Fall back to D2D memcpy.\n", - SrcDevId, DstDevId); - CUDA_ERR_STRING(Err); - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - if (!CanAccessPeer) { - REPORT("P2P access from %d to %d is not supported. Fall back to D2D " - "memcpy.\n", - SrcDevId, DstDevId); - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0); - if (Err != CUDA_SUCCESS) { - REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32 - ", dst = %" PRId32 ". Fall back to D2D memcpy.\n", - SrcDevId, DstDevId); - CUDA_ERR_STRING(Err); - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No; - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes; - - [[fallthrough]]; - } - case PeerAccessState::Yes: { - Err = cuMemcpyPeerAsync( - (CUdeviceptr)DstPtr, DeviceData[DstDevId].Context, - (CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream); - if (Err == CUDA_SUCCESS) - return OFFLOAD_SUCCESS; - - DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD - ", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32 - ". Fall back to D2D memcpy.\n", - DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId); - CUDA_ERR_STRING(Err); - - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - } - } - - return memcpyDtoD(SrcPtr, DstPtr, Size, Stream); - } - - int dataDelete(const int DeviceId, void *TgtPtr, TargetAllocTy Kind) { - switch (Kind) { - case TARGET_ALLOC_DEFAULT: - case TARGET_ALLOC_DEVICE: - if (UseMemoryManager) - return MemoryManagers[DeviceId]->free(TgtPtr); - else - return DeviceAllocators[DeviceId].free(TgtPtr, Kind); - case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - return DeviceAllocators[DeviceId].free(TgtPtr, Kind); - } - - REPORT("Invalid target data allocation kind or requested allocator not " - "implemented yet\n"); - - return OFFLOAD_FAIL; - } - - int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs, - ptrdiff_t *TgtOffsets, const int ArgNum, - const int TeamNum, const int ThreadLimit, - const unsigned int LoopTripCount, - __tgt_async_info *AsyncInfo) const { - // All args are references. 
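
`dataExchange` above memoizes peer-access probes in `PeerAccessMatrix`, so `cuDeviceCanAccessPeer` and `cuCtxEnablePeerAccess` run at most once per device pair, with plain device-to-device staging as the fallback. A minimal sketch of that memoization (the probe itself is faked; only the caching pattern is the point):

```cpp
#include <cstdio>
#include <mutex>
#include <vector>

enum class PeerState { Unknown, Yes, No }; // spelled "Unkown" in the plugin

std::vector<std::vector<PeerState>>
    Matrix(4, std::vector<PeerState>(4, PeerState::Unknown));
std::mutex MatrixLock;

// Fake probe standing in for cuDeviceCanAccessPeer + cuCtxEnablePeerAccess.
bool probePeerAccess(int Src, int Dst) { return (Src + Dst) % 2 == 0; }

// First call probes and caches; later calls reuse the cached answer, exactly
// the memoization the PeerAccessMatrix above implements.
bool canUsePeerCopy(int Src, int Dst) {
  std::lock_guard<std::mutex> LG(MatrixLock);
  if (Matrix[Src][Dst] == PeerState::Unknown)
    Matrix[Src][Dst] =
        probePeerAccess(Src, Dst) ? PeerState::Yes : PeerState::No;
  return Matrix[Src][Dst] == PeerState::Yes;
}

int main() {
  std::printf("0->1: %s\n", canUsePeerCopy(0, 1) ? "peer copy" : "D2D fallback");
  std::printf("0->2: %s\n", canUsePeerCopy(0, 2) ? "peer copy" : "D2D fallback");
}
```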
- std::vector Args(ArgNum); - std::vector Ptrs(ArgNum); - - for (int I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - } - - KernelTy *KernelInfo = reinterpret_cast(TgtEntryPtr); - - const bool IsSPMDGenericMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD; - const bool IsSPMDMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD; - const bool IsGenericMode = - KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC; - - int CudaThreadsPerBlock; - if (ThreadLimit > 0) { - DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); - CudaThreadsPerBlock = ThreadLimit; - // Add master warp if necessary - if (IsGenericMode) { - DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); - CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; - } - } else { - DP("Setting CUDA threads per block to default %d\n", - DeviceData[DeviceId].NumThreads); - CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; - } - - if ((unsigned)CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { - DP("Threads per block capped at device limit %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; - } - - CUresult Err; - if (!KernelInfo->MaxThreadsPerBlock) { - Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - KernelInfo->Func); - if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n")) - return OFFLOAD_FAIL; - } - - if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) { - DP("Threads per block capped at kernel limit %d\n", - KernelInfo->MaxThreadsPerBlock); - CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock; - } - - unsigned int CudaBlocksPerGrid; - if (TeamNum <= 0) { - if (LoopTripCount > 0 && EnvNumTeams < 0) { - if (IsSPMDGenericMode) { - // If we reach this point, then we are executing a kernel that was - // transformed from Generic-mode to SPMD-mode. This kernel has - // SPMD-mode execution, but needs its blocks to be scheduled - // differently because the current loop trip count only applies to the - // `teams distribute` region and will create var too few blocks using - // the regular SPMD-mode method. - CudaBlocksPerGrid = LoopTripCount; - } else if (IsSPMDMode) { - // We have a combined construct, i.e. `target teams distribute - // parallel for [simd]`. We launch so many teams so that each thread - // will execute one iteration of the loop. round up to the nearest - // integer - CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; - } else if (IsGenericMode) { - // If we reach this point, then we have a non-combined construct, i.e. - // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: - // - // #pragma omp target teams distribute - // for(...loop_tripcount...) { - // #pragma omp parallel for - // for(...) {} - // } - // - // Threads within a team will execute the iterations of the `parallel` - // loop. 
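
When no explicit team count is given, the logic above derives blocks-per-grid from the loop trip count: one loop iteration per thread in SPMD mode (rounded up), and one `distribute` iteration per team in Generic and Generic-SPMD mode. As a sketch, under the assumption that the trip count is nonzero (the plugin only takes this path when it is):

```cpp
#include <cstdint>
#include <cstdio>

enum Mode { GENERIC, SPMD };

// Hypothetical helper reproducing the team-count rules above.
uint32_t blocksForTripCount(Mode M, uint32_t TripCount,
                            uint32_t ThreadsPerBlock) {
  if (M == SPMD) // one iteration per thread, rounded up
    return (TripCount - 1) / ThreadsPerBlock + 1;
  return TripCount; // Generic: one `distribute` iteration per team
}

int main() {
  std::printf("SPMD, 10000 iters, 128 threads -> %u blocks\n",
              blocksForTripCount(SPMD, 10000, 128));    // 79
  std::printf("Generic, 10000 iters           -> %u blocks\n",
              blocksForTripCount(GENERIC, 10000, 128)); // 10000
}
```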
- CudaBlocksPerGrid = LoopTripCount; - } else { - REPORT("Unknown execution mode: %d\n", - static_cast(KernelInfo->ExecutionMode)); - return OFFLOAD_FAIL; - } - DP("Using %d teams due to loop trip count %" PRIu32 - " and number of threads per block %d\n", - CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); - } else { - DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); - CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; - } - } else { - DP("Using requested number of teams %d\n", TeamNum); - CudaBlocksPerGrid = TeamNum; - } - - if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) { - DP("Capping number of teams to team limit %d\n", - DeviceData[DeviceId].BlocksPerGrid); - CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; - } - - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Launching kernel %s with %d blocks and %d threads in %s mode\n", - (getOffloadEntry(DeviceId, TgtEntryPtr)) - ? getOffloadEntry(DeviceId, TgtEntryPtr)->name - : "(null)", - CudaBlocksPerGrid, CudaThreadsPerBlock, - (!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD")); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, - /* gridDimZ */ 1, CudaThreadsPerBlock, - /* blockDimY */ 1, /* blockDimZ */ 1, - DynamicMemorySize, Stream, &Args[0], nullptr); - if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) - return OFFLOAD_FAIL; - - DP("Launch of entry point at " DPxMOD " successful!\n", - DPxPTR(TgtEntryPtr)); - - return OFFLOAD_SUCCESS; - } - - int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const { - CUstream Stream = reinterpret_cast(AsyncInfo->Queue); - CUresult Err = cuStreamSynchronize(Stream); - - // Once the stream is synchronized, return it to stream pool and reset - // AsyncInfo. This is to make sure the synchronization only works for its - // own tasks. - StreamPool[DeviceId]->release(reinterpret_cast(AsyncInfo->Queue)); - AsyncInfo->Queue = nullptr; - - if (Err != CUDA_SUCCESS) { - DP("Error when synchronizing stream. stream = " DPxMOD - ", async info ptr = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(AsyncInfo)); - CUDA_ERR_STRING(Err); - } - return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL; - } - - int queryAsync(const int DeviceId, __tgt_async_info *AsyncInfo) const { - CUstream Stream = reinterpret_cast(AsyncInfo->Queue); - CUresult Err = cuStreamQuery(Stream); - - // Not ready streams must be considered as successful operations. - if (Err == CUDA_ERROR_NOT_READY) - return OFFLOAD_SUCCESS; - - // Once the stream is synchronized or an error occurs, return it to the - // stream pool and reset AsyncInfo. This is to make sure the - // synchronization only works for its own tasks. - StreamPool[DeviceId]->release(Stream); - AsyncInfo->Queue = nullptr; - - if (Err != CUDA_SUCCESS) { - DP("Error when querying for stream progress. stream = " DPxMOD - ", async info ptr = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(AsyncInfo)); - CUDA_ERR_STRING(Err); - } - return (Err == CUDA_SUCCESS) ? 
OFFLOAD_SUCCESS : OFFLOAD_FAIL; - } - - void printDeviceInfo(int32_t DeviceId) { - char TmpChar[1000]; - std::string TmpStr; - size_t TmpSt; - int TmpInt, TmpInt2, TmpInt3; - - CUdevice Device; - checkResult(cuDeviceGet(&Device, DeviceId), - "Error returned from cuCtxGetDevice\n"); - - cuDriverGetVersion(&TmpInt); - printf(" CUDA Driver Version: \t\t%d \n", TmpInt); - printf(" CUDA Device Number: \t\t%d \n", DeviceId); - checkResult(cuDeviceGetName(TmpChar, 1000, Device), - "Error returned from cuDeviceGetName\n"); - printf(" Device Name: \t\t\t%s \n", TmpChar); - checkResult(cuDeviceTotalMem(&TmpSt, Device), - "Error returned from cuDeviceTotalMem\n"); - printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Registers per Block: \t\t%d \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, - TmpInt3); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult(cuDeviceGetAttribute( - &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, - TmpInt3); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Texture 
Alignment: \t\t\t%d bytes \n", TmpInt); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), - "Error returned from cuDeviceGetAttribute\n"); - if (TmpInt == CU_COMPUTEMODE_DEFAULT) - TmpStr = "DEFAULT"; - else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) - TmpStr = "PROHIBITED"; - else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) - TmpStr = "EXCLUSIVE PROCESS"; - else - TmpStr = "unknown"; - printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); - checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, - Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Preemption 
Supported: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult(cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt)); - checkResult( - cuDeviceGetAttribute( - &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device), - "Error returned from cuDeviceGetAttribute\n"); - checkResult( - cuDeviceGetAttribute( - &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device), - "Error returned from cuDeviceGetAttribute\n"); - printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); - } - - int createEvent(int DeviceId, void **P) { - CUevent Event = nullptr; - if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - *P = Event; - return OFFLOAD_SUCCESS; - } - - int destroyEvent(int DeviceId, void *EventPtr) { - EventPool[DeviceId]->release(reinterpret_cast(EventPtr)); - return OFFLOAD_SUCCESS; - } - - int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo, - void *EventPtr) const { - CUstream Stream = getStream(DeviceId, AsyncInfo); - CUevent Event = reinterpret_cast(EventPtr); - - // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from - // specific CUDA version, and defined as 0x0. In previous version, per CUDA - // API document, that argument has to be 0x0. - CUresult Err = cuStreamWaitEvent(Stream, Event, 0); - if (Err != CUDA_SUCCESS) { - DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(Event)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const { - if (AsyncInfo->Queue) { - StreamPool[DeviceId]->release( - reinterpret_cast(AsyncInfo->Queue)); - AsyncInfo->Queue = nullptr; - } - - return OFFLOAD_SUCCESS; - } - - int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const { - *AsyncInfo = new __tgt_async_info; - getStream(DeviceId, *AsyncInfo); - return OFFLOAD_SUCCESS; - } - - int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo, - const char **ErrStr) const { - assert(DeviceInfo && "DeviceInfo is nullptr"); - - if (!DeviceInfo->Context) - DeviceInfo->Context = DeviceData[DeviceId].Context; - if (!DeviceInfo->Device) { - CUdevice Dev; - CUresult Err = cuDeviceGet(&Dev, DeviceId); - if (Err == CUDA_SUCCESS) { - DeviceInfo->Device = reinterpret_cast(Dev); - } else { - cuGetErrorString(Err, ErrStr); - return OFFLOAD_FAIL; - } - } - return OFFLOAD_SUCCESS; - } - - int setContext(int DeviceId) { - assert(InitializedFlags[DeviceId] && "Device is not initialized"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "error returned from cuCtxSetCurrent")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } -}; - -DeviceRTLTy DeviceRTL; -} // namespace - -// Exposed library API function -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { - return elf_check_machine(Image, /* EM_CUDA */ 190); -} - -int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *Image, - __tgt_image_info *Info) { - if (!__tgt_rtl_is_valid_binary(Image)) - return false; - - // A subarchitecture was not specified. Assume it is compatible. 
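
`createEvent` and `destroyEvent` above are thin wrappers over the per-device event `ResourcePoolTy` defined earlier: resources are handed out LIFO from a vector split at `Next`, and "destroy" only returns the slot. A toy integer pool with the same acquire/release contract (all of this is illustrative, not the plugin's types):

```cpp
#include <cstdio>
#include <mutex>
#include <vector>

// Toy pool with the same contract as ResourcePoolTy above:
// `Next` splits the vector into in-use [0, Next) and free [Next, size()).
struct IntPool {
  std::vector<int> Resources;
  size_t Next = 0;
  std::mutex M;
  int Counter = 0;

  int acquire() {
    std::lock_guard<std::mutex> LG(M);
    if (Next == Resources.size())         // pool exhausted: grow by doubling
      for (size_t N = Resources.size() ? Resources.size() : 1; N--;)
        Resources.push_back(++Counter);   // "create" a fresh resource
    return Resources[Next++];
  }

  void release(int R) {
    std::lock_guard<std::mutex> LG(M);
    Resources[--Next] = R;                // may reorder the pool, which is fine
  }
};

int main() {
  IntPool P;
  int A = P.acquire(), B = P.acquire();
  P.release(B);
  P.release(A);
  std::printf("reused: %d\n", P.acquire()); // returns A: last released first
}
```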
- if (!Info || !Info->Arch) - return true; - - int32_t NumberOfDevices = 0; - if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS) - return false; - - StringRef ArchStr = StringRef(Info->Arch).drop_front(sizeof("sm_") - 1); - for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { - CUdevice Device; - if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS) - return false; - - int32_t Major, Minor; - if (cuDeviceGetAttribute(&Major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - Device) != CUDA_SUCCESS) - return false; - if (cuDeviceGetAttribute(&Minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - Device) != CUDA_SUCCESS) - return false; - - // A cubin generated for a certain compute capability is supported to run on - // any GPU with the same major revision and same or higher minor revision. - int32_t ImageMajor = ArchStr[0] - '0'; - int32_t ImageMinor = ArchStr[1] - '0'; - if (Major != ImageMajor || Minor < ImageMinor) - return false; - } - - DP("Image has compatible compute capability: %s\n", Info->Arch); - return true; -} - -int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %" PRId64 "\n", RequiresFlags); - DeviceRTL.setRequiresFlag(RequiresFlags); - return RequiresFlags; -} - -int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) { - if (DeviceRTL.isValidDeviceId(SrcDevId) && - DeviceRTL.isValidDeviceId(DstDevId)) - return 1; - - return 0; -} - -int32_t __tgt_rtl_init_device(int32_t DeviceId) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set when init the device. - - return DeviceRTL.initDevice(DeviceId); -} - -int32_t __tgt_rtl_deinit_device(int32_t DeviceId) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set when deinit the device. - - return DeviceRTL.deinitDevice(DeviceId); -} - -__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, - __tgt_device_image *Image) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return nullptr; - - return DeviceRTL.loadBinary(DeviceId, Image); -} - -void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *, - int32_t Kind) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - - if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS) - return nullptr; - - return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind); -} - -int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, - int64_t Size) { - assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid"); - // Context is set in __tgt_rtl_data_submit_async. 
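
`__tgt_rtl_is_valid_binary_info` above encodes the CUDA compatibility rule: a cubin built for `sm_XY` is accepted only by devices with the same major revision X and a minor revision of at least Y. A sketch of that check (hypothetical helper; like the plugin, it assumes single-digit major and minor digits after `sm_`):

```cpp
#include <cstdio>
#include <string>

// Hypothetical check mirroring the rule above: a cubin built for sm_XY runs
// on a device with the same major X and minor >= Y.
bool isCompatible(const std::string &Arch, int DevMajor, int DevMinor) {
  std::string Digits = Arch.substr(3); // drop "sm_", like drop_front above
  int ImgMajor = Digits[0] - '0';
  int ImgMinor = Digits[1] - '0';
  return DevMajor == ImgMajor && DevMinor >= ImgMinor;
}

int main() {
  std::printf("%d\n", isCompatible("sm_70", 7, 5)); // 1: sm_70 runs on sm_75
  std::printf("%d\n", isCompatible("sm_75", 7, 0)); // 0: needs minor >= 5
  std::printf("%d\n", isCompatible("sm_70", 8, 0)); // 0: major must match
}
```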
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc =
-      __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
-                                    void *HstPtr, int64_t Size,
-                                    __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_data_retrieve_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc =
-      __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
-                                      void *TgtPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
-                                      int DstDevId, void *DstPtr, int64_t Size,
-                                      __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
-  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
-  assert(AsyncInfo && "AsyncInfo is nullptr");
-
-  if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
-                                AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
-                                int32_t DstDevId, void *DstPtr, int64_t Size) {
-  assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
-  assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
-  // Context is set in __tgt_rtl_data_exchange_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
-                                                   DstPtr, Size, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.dataDelete(DeviceId, TgtPtr, (TargetAllocTy)Kind);
-}
-
-int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
-                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                         int32_t ArgNum, int32_t TeamNum,
-                                         int32_t ThreadLimit,
-                                         uint64_t LoopTripcount) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_team_region_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_run_target_team_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
-      LoopTripcount, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_team_region_async(
-    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
-    int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
-    uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
-                                       TgtOffsets, ArgNum, TeamNum, ThreadLimit,
-                                       LoopTripcount, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
-                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                    int32_t ArgNum) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_region_async.
-
-  __tgt_async_info AsyncInfo;
-  const int32_t Rc = __tgt_rtl_run_target_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
-  if (Rc != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
-}
-
-int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
-                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                          int32_t ArgNum,
-                                          __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // Context is set in __tgt_rtl_run_target_team_region_async.
-  return __tgt_rtl_run_target_team_region_async(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
-      /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
-      AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_synchronize(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  // NOTE: We don't need to set context for stream sync.
-  return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_query_async(int32_t DeviceId,
-                              __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  // NOTE: We don't need to set context for stream query.
-  return DeviceRTL.queryAsync(DeviceId, AsyncInfoPtr);
-}
-
-void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
-  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
-  InfoLevel.store(NewInfoLevel);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  // NOTE: We don't need to set context for print device info.
-  DeviceRTL.printDeviceInfo(DeviceId);
-}
-
-int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
-  assert(Event && "event is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.createEvent(DeviceId, Event);
-}
-
-int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
-                               __tgt_async_info *AsyncInfoPtr) {
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
-  assert(EventPtr && "event_ptr is nullptr");
-  // NOTE: We might not need to set context for event record.
-  return recordEvent(EventPtr, AsyncInfoPtr);
-}
-
-int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
-                             __tgt_async_info *AsyncInfoPtr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfoPtr && "async_info_ptr is nullptr");
-  assert(EventPtr && "event is nullptr");
-  // If we don't have a queue we need to set the context.
-  if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-  return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
-}
-
-int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
-  assert(EventPtr && "event is nullptr");
-  // NOTE: We might not need to set context for event sync.
-  return syncEvent(EventPtr);
-}
-
-int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
-  assert(EventPtr && "event is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.destroyEvent(DeviceId, EventPtr);
-}
-
-int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
-                                     __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfo && "async_info is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
-                                  __tgt_async_info **AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(AsyncInfo && "async_info is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
-}
-
-int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
-                                   __tgt_device_info *DeviceInfoPtr,
-                                   const char **ErrStr) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-  assert(DeviceInfoPtr && "device_info_ptr is nullptr");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfo) {
-  assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
-
-  if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
-    return OFFLOAD_FAIL;
-
-  return DeviceRTL.runTargetTeamRegion(
-      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs->NumArgs,
-      KernelArgs->NumTeams[0], KernelArgs->ThreadLimit[0],
-      KernelArgs->Tripcount, AsyncInfo);
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports
deleted file mode 100644
--- a/openmp/libomptarget/plugins/exports
+++ /dev/null
@@ -1,6 +0,0 @@
-VERS1.0 {
-  global:
-    __tgt_rtl*;
-  local:
-    *;
-};
diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
deleted file mode 100644
--- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp
+++ /dev/null
@@ -1,280 +0,0 @@
-//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// RTL for generic 64-bit machine
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/DynamicLibrary.h"
-
-#include <cassert>
-#include <cstddef>
-#include <cstdlib>
-#include <cstring>
-#include <ffi.h>
-#include <link.h>
-#include <list>
-#include <string>
-#include <vector>
-
-#include "Debug.h"
-#include "omptargetplugin.h"
-
-using namespace llvm;
-using namespace llvm::sys;
-
-#ifndef TARGET_NAME
-#define TARGET_NAME Generic ELF - 64bit
-#endif
-#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) " RTL"
-
-#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID 0
-#endif
-
-#include "elf_common.h"
-
-#define NUMBER_OF_DEVICES 4
-#define OFFLOAD_SECTION_NAME "omp_offloading_entries"
-
-/// Array of Dynamic libraries loaded for this target.
-struct DynLibTy {
-  std::string FileName;
-  std::unique_ptr<sys::DynamicLibrary> DynLib;
-};
-
-/// Keep entries table per device.
-struct FuncOrGblEntryTy {
-  __tgt_target_table Table;
-  SmallVector<__tgt_offload_entry> Entries;
-};
-
-/// Class containing all the device information.
-class RTLDeviceInfoTy {
-  std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
-
-public:
-  std::list<DynLibTy> DynLibs;
-
-  // Record entry point associated with device.
-  void createOffloadTable(int32_t DeviceId,
-                          SmallVector<__tgt_offload_entry> &&Entries) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncGblEntries[DeviceId].emplace_back();
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    E.Entries = Entries;
-    E.Table.EntriesBegin = E.Entries.begin();
-    E.Table.EntriesEnd = E.Entries.end();
-  }
-
-  // Return true if the entry is associated with device.
-  bool findOffloadEntry(int32_t DeviceId, void *Addr) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    for (__tgt_offload_entry *I = E.Table.EntriesBegin,
-                             *End = E.Table.EntriesEnd;
-         I < End; ++I) {
-      if (I->addr == Addr)
-        return true;
-    }
-
-    return false;
-  }
-
-  // Return the pointer to the target entries table.
-  __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
-    assert(DeviceId < (int32_t)FuncGblEntries.size() &&
-           "Unexpected device id!");
-    FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
-
-    return &E.Table;
-  }
-
-  RTLDeviceInfoTy(int32_t NumDevices) { FuncGblEntries.resize(NumDevices); }
-
-  ~RTLDeviceInfoTy() {
-    // Close dynamic libraries
-    for (auto &Lib : DynLibs) {
-      if (Lib.DynLib->isValid())
-        remove(Lib.FileName.c_str());
-    }
-  }
-};
-
-static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES);
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
-// If we don't have a valid ELF ID we can just fail.
-#if TARGET_ELF_ID < 1
-  return 0;
-#else
-  return elf_check_machine(Image, TARGET_ELF_ID);
-#endif
-}
-
-int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; }
-
-int32_t __tgt_rtl_init_device(int32_t DeviceId) { return OFFLOAD_SUCCESS; }
-
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *Image) {
-
-  DP("Dev %d: load binary from " DPxMOD " image\n", DeviceId,
-     DPxPTR(Image->ImageStart));
-
-  assert(DeviceId >= 0 && DeviceId < NUMBER_OF_DEVICES && "bad dev id");
-
-  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
-
-  // load dynamic library and get the entry points. We use the dl library
-  // to do the loading of the library, but we could do it directly to avoid the
-  // dump to the temporary file.
-  //
-  // 1) Create tmp file with the library contents.
-  // 2) Use dlopen to load the file and dlsym to retrieve the symbols.
-  char TmpName[] = "/tmp/tmpfile_XXXXXX";
-  int TmpFd = mkstemp(TmpName);
-
-  if (TmpFd == -1)
-    return nullptr;
-
-  FILE *Ftmp = fdopen(TmpFd, "wb");
-
-  if (!Ftmp)
-    return nullptr;
-
-  fwrite(Image->ImageStart, ImageSize, 1, Ftmp);
-  fclose(Ftmp);
-
-  std::string ErrMsg;
-  auto DynLib = std::make_unique<sys::DynamicLibrary>(
-      sys::DynamicLibrary::getPermanentLibrary(TmpName, &ErrMsg));
-  DynLibTy Lib = {TmpName, std::move(DynLib)};
-
-  if (!Lib.DynLib->isValid()) {
-    DP("Target library loading error: %s\n", ErrMsg.c_str());
-    return NULL;
-  }
-
-  __tgt_offload_entry *HostBegin = Image->EntriesBegin;
-  __tgt_offload_entry *HostEnd = Image->EntriesEnd;
-
-  // Create a new offloading entry list using the device symbol address.
-  SmallVector<__tgt_offload_entry> Entries;
-  for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
-    if (!E->addr)
-      return nullptr;
-
-    __tgt_offload_entry Entry = *E;
-
-    void *DevAddr = Lib.DynLib->getAddressOfSymbol(E->name);
-    Entry.addr = DevAddr;
-
-    DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
-       DPxPTR(E - HostBegin), E->name, DPxPTR(DevAddr));
-
-    Entries.emplace_back(Entry);
-  }
-
-  DeviceInfo.createOffloadTable(DeviceId, std::move(Entries));
-  DeviceInfo.DynLibs.emplace_back(std::move(Lib));
-
-  return DeviceInfo.getOffloadEntriesTable(DeviceId);
-}
-
-void __tgt_rtl_print_device_info(int32_t DeviceId) {
-  printf("    This is a generic-elf-64bit device\n");
-}
-
-// Sample implementation of explicit memory allocator. For this plugin all kinds
-// are equivalent to each other.
-void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
-                           int32_t Kind) {
-  void *Ptr = NULL;
-
-  switch (Kind) {
-  case TARGET_ALLOC_DEVICE:
-  case TARGET_ALLOC_HOST:
-  case TARGET_ALLOC_SHARED:
-  case TARGET_ALLOC_DEFAULT:
-    Ptr = malloc(Size);
-    break;
-  default:
-    REPORT("Invalid target data allocation kind");
-  }
-
-  return Ptr;
-}
-
-int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
-                              int64_t Size) {
-  memcpy(TgtPtr, HstPtr, Size);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
-                                int64_t Size) {
-  memcpy(HstPtr, TgtPtr, Size);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t) {
-  free(TgtPtr);
-  return OFFLOAD_SUCCESS;
-}
-
-int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
-                                void **TgtArgs, ptrdiff_t *TgtOffsets,
-                                KernelArgsTy *KernelArgs,
-                                __tgt_async_info *AsyncInfoPtr) {
-  assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
-         !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
-         "Only one dimensional kernels supported.");
-  // ignore team num and thread limit.
-
-  // Use libffi to launch execution.
-  ffi_cif Cif;
-
-  // All args are references.
-  std::vector<ffi_type *> ArgsTypes(KernelArgs->NumArgs, &ffi_type_pointer);
-  std::vector<void *> Args(KernelArgs->NumArgs);
-  std::vector<void *> Ptrs(KernelArgs->NumArgs);
-
-  for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
-    Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
-    Args[I] = &Ptrs[I];
-  }
-
-  ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, KernelArgs->NumArgs,
-                                   &ffi_type_void, &ArgsTypes[0]);
-
-  assert(Status == FFI_OK && "Unable to prepare target launch!");
-
-  if (Status != FFI_OK)
-    return OFFLOAD_FAIL;
-
-  DP("Running entry point at " DPxMOD "...\n", DPxPTR(TgtEntryPtr));
-
-  void (*Entry)(void);
-  *((void **)&Entry) = TgtEntryPtr;
-  ffi_call(&Cif, Entry, NULL, &Args[0]);
-  return OFFLOAD_SUCCESS;
-}
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
-else()
-  libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64le machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
-else()
-  libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
deleted file mode 100644
--- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a x86_64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
-  build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
-else()
-  libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp
--- a/openmp/libomptarget/src/rtl.cpp
+++ b/openmp/libomptarget/src/rtl.cpp
@@ -99,8 +99,6 @@
 
   DP("Loading RTLs...\n");
 
-  BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", true);
-
   // Attempt to open all the plugins and, if they exist, check if the interface
   // is correct and if they are supporting any devices.
   for (const char *Name : RTLNames) {
@@ -109,13 +107,6 @@
     RTLInfoTy &RTL = AllRTLs.back();
     const std::string BaseRTLName(Name);
 
-    if (NextGenPlugins) {
-      if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL))
-        continue;
-
-      DP("Falling back to original plugin...\n");
-    }
-
     if (!attemptLoadRTL(BaseRTLName + ".so", RTL))
       AllRTLs.pop_back();
   }