Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
===================================================================
--- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -56,24 +56,24 @@
 
 struct GV {
   /// The size reserved for data in a shared memory slot.
-  const unsigned GV_Slot_Size;
+  unsigned GV_Slot_Size;
   /// The default value of maximum number of threads in a worker warp.
-  const unsigned GV_Warp_Size;
+  unsigned GV_Warp_Size;
 
   constexpr unsigned warpSlotSize() const {
     return GV_Warp_Size * GV_Slot_Size;
   }
 
   /// the maximum number of teams.
-  const unsigned GV_Max_Teams;
+  unsigned GV_Max_Teams;
   // An alternative to the heavy data sharing infrastructure that uses global
   // memory is one that uses device __shared__ memory.  The amount of such space
   // (in bytes) reserved by the OpenMP runtime is noted here.
-  const unsigned GV_SimpleBufferSize;
+  unsigned GV_SimpleBufferSize;
   // The absolute maximum team size for a working group
-  const unsigned GV_Max_WG_Size;
+  unsigned GV_Max_WG_Size;
   // The default maximum team size for a working group
-  const unsigned GV_Default_WG_Size;
+  unsigned GV_Default_WG_Size;
 
   constexpr unsigned maxWarpNumber() const {
     return GV_Max_WG_Size / GV_Warp_Size;
Index: openmp/libomptarget/CMakeLists.txt
===================================================================
--- openmp/libomptarget/CMakeLists.txt
+++ openmp/libomptarget/CMakeLists.txt
@@ -85,6 +85,7 @@
 
 # Build offloading plugins and device RTLs if they are available.
 add_subdirectory(plugins)
+add_subdirectory(plugins-nextgen)
 add_subdirectory(DeviceRTL)
 add_subdirectory(tools)
 
Index: openmp/libomptarget/include/Utilities.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/include/Utilities.h
@@ -0,0 +1,200 @@
+//===------- Utilities.h - Target independent OpenMP target RTL -- C++ ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Routines and classes used to provide useful functionalities like string
+// parsing and environment variables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H
+#define OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H
+
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+#include "Debug.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <sstream>
+#include <string>
+
+namespace llvm {
+namespace omp {
+namespace target {
+
+/// Utility class for parsing strings to other types.
+struct StringParser {
+  /// Parse a string to another type.
+  template <typename Ty> static bool parse(const char *Value, Ty &Result);
+};
+
+/// Class for reading and checking environment variables. Currently working with
+/// integer, floats, std::string and bool types.
+template <typename Ty> class Envar {
+  Ty Data;
+  bool IsPresent;
+  bool Initialized;
+
+public:
+  /// Auxiliary function to safely create envars. This static function safely
+  /// creates envars using fallible constructors. See the constructors to know
+  /// more details about the creation parameters.
+  template <typename... ArgsTy>
+  static Expected<Envar> create(ArgsTy &&...Args) {
+    Error Err = Error::success();
+    Envar Envar(std::forward<ArgsTy>(Args)..., Err);
+    if (Err)
+      return std::move(Err);
+    return std::move(Envar);
+  }
+
+  /// Create an empty envar. Cannot be consulted. This constructor is merely
+  /// for convenience. This constructor is not fallible.
+  Envar() : Data(Ty()), IsPresent(false), Initialized(false) {}
+
+  /// Create an envar with a name and an optional default. The Envar object will
+  /// take the value read from the environment variable, or the default if it
+  /// was not set or not correct. This constructor is not fallible.
+  Envar(StringRef Name, Ty Default = Ty())
+      : Data(Default), IsPresent(false), Initialized(true) {
+
+    if (const char *EnvStr = getenv(Name.data())) {
+      // Check whether the envar is defined and valid.
+      IsPresent = StringParser::parse<Ty>(EnvStr, Data);
+
+      if (!IsPresent) {
+        DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data());
+        Data = Default;
+      }
+    }
+  }
+
+  /// Get the definitive value.
+  const Ty &get() const {
+    // Throw a runtime error in case this envar is not initialized.
+    if (!Initialized)
+      FATAL_MESSAGE0(1, "Consulting envar before initialization");
+
+    return Data;
+  }
+
+  /// Get the definitive value.
+  operator Ty() const { return get(); }
+
+  /// Indicate whether the environment variable was defined and valid.
+  bool isPresent() const { return IsPresent; }
+
+private:
+  /// This constructor should never fail but we provide it for convenience. This
+  /// way, the constructor can be used by the Envar::create() static function
+  /// to safely create this kind of envars.
+  Envar(StringRef Name, Ty Default, Error &Err) : Envar(Name, Default) {
+    ErrorAsOutParameter EAO(&Err);
+    Err = Error::success();
+  }
+
+  /// Create an envar with a name, getter function and a setter function. The
+  /// Envar object will take the value read from the environment variable if
+  /// this value is accepted by the setter function. Otherwise, the getter
+  /// function will be executed to get the default value. The getter should be
+  /// of the form Error GetterFunctionTy(Ty &Value) and the setter should
+  /// be of the form Error SetterFunctionTy(Ty Value). This constructor has a
+  /// private visibility because is a fallible constructor. Please use the
+  /// Envar::create() static function to safely create this object instead.
+  template <typename GetterFunctor, typename SetterFunctor>
+  Envar(StringRef Name, GetterFunctor Getter, SetterFunctor Setter, Error &Err)
+      : Data(Ty()), IsPresent(false), Initialized(true) {
+    ErrorAsOutParameter EAO(&Err);
+    Err = init(Name, Getter, Setter);
+  }
+
+  template <typename GetterFunctor, typename SetterFunctor>
+  Error init(StringRef Name, GetterFunctor Getter, SetterFunctor Setter);
+};
+
+/// Define some common envar types.
+using IntEnvar = Envar<int>;
+using Int32Envar = Envar<int32_t>;
+using Int64Envar = Envar<int64_t>;
+using UInt32Envar = Envar<uint32_t>;
+using UInt64Envar = Envar<uint64_t>;
+using StringEnvar = Envar<std::string>;
+using BoolEnvar = Envar<bool>;
+
+template <>
+inline bool StringParser::parse(const char *ValueStr, bool &Result) {
+  std::string Value(ValueStr);
+
+  // Convert the string to lowercase.
+  std::transform(Value.begin(), Value.end(), Value.begin(),
+                 [](unsigned char c) { return std::tolower(c); });
+
+  // May be implemented with fancier C++ features, but let's keep it simple.
+  if (Value == "true" || Value == "yes" || Value == "on" || Value == "1")
+    Result = true;
+  else if (Value == "false" || Value == "no" || Value == "off" || Value == "0")
+    Result = false;
+  else
+    return false;
+
+  // Parsed correctly.
+  return true;
+}
+
+template <typename Ty>
+inline bool StringParser::parse(const char *Value, Ty &Result) {
+  assert(Value && "Parsed value cannot be null");
+
+  std::istringstream Stream(Value);
+  Stream >> Result;
+
+  return !Stream.fail();
+}
+
+template <typename Ty>
+template <typename GetterFunctor, typename SetterFunctor>
+inline Error Envar<Ty>::init(StringRef Name, GetterFunctor Getter,
+                             SetterFunctor Setter) {
+  // Get the default value.
+  Ty Default;
+  if (Error Err = Getter(Default))
+    return Err;
+
+  if (const char *EnvStr = getenv(Name.data())) {
+    IsPresent = StringParser::parse<Ty>(EnvStr, Data);
+    if (IsPresent) {
+      // Check whether the envar value is actually valid.
+      Error Err = Setter(Data);
+      if (Err) {
+        // The setter reported an invalid value. Mark the user-defined value as
+        // not present and reset to the getter value (default).
+        IsPresent = false;
+        Data = Default;
+        DP("Setter of envar %s failed, resetting to %s\n", Name.data(),
+           std::to_string(Data).data());
+        consumeError(std::move(Err));
+      }
+    } else {
+      DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data());
+      Data = Default;
+    }
+  } else {
+    Data = Default;
+  }
+
+  return Error::success();
+}
+
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H
Index: openmp/libomptarget/include/rtl.h
===================================================================
--- openmp/libomptarget/include/rtl.h
+++ openmp/libomptarget/include/rtl.h
@@ -169,6 +169,9 @@
   // (i.e. the library attempts to load the RTLs (plugins) only once).
   std::once_flag InitFlag;
   void loadRTLs(); // not thread-safe
+
+private:
+  static bool attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL);
 };
 
 /// Map between the host entry begin and the translation table. Each
Index: openmp/libomptarget/plugins-nextgen/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -0,0 +1,87 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build plugins for the user system if available.
+#
+##===----------------------------------------------------------------------===##
+
+add_subdirectory(common)
+
+# void build_generic_elf64_nextgen(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id);
+# - build a plugin for an ELF based generic 64-bit target based on libffi.
+# - tmachine: name of the machine processor as used in the cmake build system.
+# - tmachine_name: name of the machine to be printed with the debug messages.
+# - tmachine_libname: machine name to be appended to the plugin library name.
+macro(build_generic_elf64_nextgen tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
+  if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+
+    libomptarget_say("Building ${tmachine_name} NextGen offloading plugin.")
+
+    # Define macro to be used as prefix of the runtime messages for this target.
+    add_definitions("-DTARGET_NAME=${tmachine_name}")
+
+    # Define debug prefix. TODO: This should be automatized in the Debug.h but
+    # it requires changing the original plugins.
+    add_definitions(-DDEBUG_PREFIX="TARGET ${tmachine_name} RTL")
+
+    # Define macro with the ELF ID for this target.
+    add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
+
+    add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen"
+      SHARED
+
+      ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
+
+      ADDITIONAL_HEADER_DIRS
+      ${LIBOMPTARGET_INCLUDE_DIR}
+      ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}
+
+        LINK_LIBS
+        PRIVATE
+        elf_common
+        MemoryManager
+        PluginInterface
+        ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES}
+        ${OPENMP_PTHREAD_LIB}
+        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+
+      NO_INSTALL_RPATH
+    )
+
+    # Install plugin under the lib destination folder.
+    install(TARGETS "omptarget.rtl.${tmachine_libname}.nextgen"
+      LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+    set_target_properties("omptarget.rtl.${tmachine_libname}.nextgen" PROPERTIES
+      INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
+      CXX_VISIBILITY_PRESET protected)
+
+    target_include_directories( "omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE
+      ${LIBOMPTARGET_INCLUDE_DIR}
+      ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR})
+
+    list(APPEND LIBOMPTARGET_TESTED_PLUGINS
+      "omptarget.rtl.${tmachine_libname}.nextgen")
+
+  else(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+    libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: libffi dependency not found.")
+  endif(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+else()
+  libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: machine not found in the system.")
+endif()
+endmacro()
+
+add_subdirectory(aarch64)
+add_subdirectory(cuda)
+add_subdirectory(ppc64)
+add_subdirectory(ppc64le)
+add_subdirectory(x86_64)
+
+# Make sure the parent scope can see the plugins that will be created.
+set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
Index: openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for an aarch64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+else()
+ libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.")
+endif()
Index: openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt
@@ -0,0 +1,13 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Common parts which can be used by all plugins
+#
+##===----------------------------------------------------------------------===##
+
+add_subdirectory(PluginInterface)
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
@@ -0,0 +1,32 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Common parts which can be used by all plugins
+#
+##===----------------------------------------------------------------------===##
+
+# Plugin Interface library.
+add_library(PluginInterface OBJECT PluginInterface.cpp GlobalHandler.cpp)
+
+# Define the TARGET_NAME.
+add_definitions("-DTARGET_NAME=PluginInterface")
+
+# Define the DEBUG_PREFIX.
+add_definitions(-DDEBUG_PREFIX="PluginInterface")
+
+set_property(TARGET PluginInterface PROPERTY POSITION_INDEPENDENT_CODE ON)
+llvm_update_compile_flags(PluginInterface)
+set(LINK_LLVM_LIBS LLVMSupport)
+if (LLVM_LINK_LLVM_DYLIB)
+  set(LINK_LLVM_LIBS LLVM)
+endif()
+target_link_libraries(PluginInterface INTERFACE ${LINK_LLVM_LIBS} PRIVATE elf_common MemoryManager)
+add_dependencies(PluginInterface ${LINK_LLVM_LIBS})
+
+target_include_directories(PluginInterface INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(PluginInterface PRIVATE ${LIBOMPTARGET_INCLUDE_DIR})
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h
@@ -0,0 +1,180 @@
+//===- GlobalHandler.h - Target independent global & enviroment handling --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target independent global handler and environment manager.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
+
+#include <string>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+#include "Debug.h"
+#include "Utilities.h"
+#include "omptarget.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+struct DeviceImageTy;
+struct GenericDeviceTy;
+
+using namespace llvm::object;
+
+/// Common abstraction for globals that live on the host and device.
+/// It simply encapsulates the symbol name, symbol size, and symbol address
+/// (which might be host or device depending on the context).
+class GlobalTy {
+  // NOTE: Maybe we can have a pointer to the offload entry name instead of
+  // holding a private copy of the name as a std::string.
+  std::string Name;
+  uint32_t Size;
+  void *Ptr;
+
+public:
+  GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
+      : Name(Name), Size(Size), Ptr(Ptr) {}
+
+  GlobalTy(const __tgt_offload_entry &Entry)
+      : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {}
+
+  const std::string &getName() const { return Name; }
+  uint32_t getSize() const { return Size; }
+  void *getPtr() const { return Ptr; }
+
+  void setSize(int32_t S) { Size = S; }
+  void setPtr(void *P) { Ptr = P; }
+};
+
+/// Subclass of GlobalTy that holds the memory for a global of \p Ty.
+template <typename Ty> class StaticGlobalTy : public GlobalTy {
+  Ty Data;
+
+public:
+  template <typename... Args>
+  StaticGlobalTy(const std::string &Name, Args &&...args)
+      : GlobalTy(Name, sizeof(Ty), &Data),
+        Data(Ty{std::forward<Args>(args)...}) {}
+
+  template <typename... Args>
+  StaticGlobalTy(const char *Name, Args &&...args)
+      : GlobalTy(Name, sizeof(Ty), &Data),
+        Data(Ty{std::forward<Args>(args)...}) {}
+
+  template <typename... Args>
+  StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args)
+      : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data),
+        Data(Ty{std::forward<Args>(args)...}) {}
+
+  Ty &getValue() { return Data; }
+  const Ty &getValue() const { return Data; }
+  void setValue(const Ty &V) { Data = V; }
+};
+
+/// Helper class to do the heavy lifting when it comes to moving globals between
+/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD,
+/// which means the only things specialized by the subclass is the retrival of
+/// global metadata (size, addr) from the device.
+/// \see getGlobalMetadataFromDevice
+class GenericGlobalHandlerTy {
+  /// Map to store the ELF object files that have been loaded.
+  llvm::DenseMap<int32_t, ELF64LEObjectFile> ELFObjectFiles;
+
+  /// Get the cached ELF64LEObjectFile previosuly created for a specific
+  /// device image or create it if did not exist.
+  const ELF64LEObjectFile *
+  getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image);
+
+  /// Actually move memory between host and device. See readGlobalFromDevice and
+  /// writeGlobalToDevice for the interface description.
+  Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
+                                       DeviceImageTy &Image,
+                                       const GlobalTy &HostGlobal,
+                                       bool Device2Host);
+
+  /// Actually move memory between host and device. See readGlobalFromDevice and
+  /// writeGlobalToDevice for the interface description.
+  Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device,
+                                       DeviceImageTy &Image,
+                                       const GlobalTy &HostGlobal,
+                                       const GlobalTy &DeviceGlobal,
+                                       bool Device2Host);
+
+public:
+  virtual ~GenericGlobalHandlerTy() {}
+
+  /// Get the address and size of a global in the image. Address and size are
+  /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal.
+  Error getGlobalMetadataFromImage(GenericDeviceTy &Device,
+                                   DeviceImageTy &Image, GlobalTy &ImageGlobal);
+
+  /// Read the memory associated with a global from the image and store it on
+  /// the host. The name, size, and destination are defined by \p HostGlobal.
+  Error readGlobalFromImage(GenericDeviceTy &Device, DeviceImageTy &Image,
+                            const GlobalTy &HostGlobal);
+
+  /// Get the address and size of a global from the device. Address is return in
+  /// \p DeviceGlobal, the global name and expected size are passed in
+  /// \p DeviceGlobal.
+  virtual Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                            DeviceImageTy &Image,
+                                            GlobalTy &DeviceGlobal) = 0;
+
+  /// Copy the memory associated with a global from the device to its
+  /// counterpart on the host. The name, size, and destination are defined by
+  /// \p HostGlobal. The origin is defined by \p DeviceGlobal.
+  Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
+                             const GlobalTy &HostGlobal,
+                             const GlobalTy &DeviceGlobal) {
+    return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
+                                          DeviceGlobal,
+                                          /* D2H */ true);
+  }
+
+  /// Copy the memory associated with a global from the device to its
+  /// counterpart on the host. The name, size, and destination are defined by
+  /// \p HostGlobal. The origin is automatically resolved.
+  Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
+                             const GlobalTy &HostGlobal) {
+    return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
+                                          /* D2H */ true);
+  }
+
+  /// Copy the memory associated with a global from the host to its counterpart
+  /// on the device. The name, size, and origin are defined by \p HostGlobal.
+  /// The destination is defined by \p DeviceGlobal.
+  Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
+                            const GlobalTy &HostGlobal,
+                            const GlobalTy &DeviceGlobal) {
+    return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
+                                          DeviceGlobal,
+                                          /* D2H */ false);
+  }
+
+  /// Copy the memory associated with a global from the host to its counterpart
+  /// on the device. The name, size, and origin are defined by \p HostGlobal.
+  /// The destination is automatically resolved.
+  Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image,
+                            const GlobalTy &HostGlobal) {
+    return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal,
+                                          /* D2H */ false);
+  }
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp
@@ -0,0 +1,152 @@
+//===- GlobalHandler.cpp - Target independent global & env. var handling --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target independent global handler and environment manager.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GlobalHandler.h"
+#include "ELFSymbols.h"
+#include "PluginInterface.h"
+
+#include <cstring>
+
+using namespace llvm;
+using namespace omp;
+using namespace target;
+using namespace plugin;
+
+const ELF64LEObjectFile *
+GenericGlobalHandlerTy::getOrCreateELFObjectFile(const GenericDeviceTy &Device,
+                                                 DeviceImageTy &Image) {
+
+  auto Search = ELFObjectFiles.find(Image.getId());
+  if (Search != ELFObjectFiles.end())
+    // The ELF object file was already there.
+    return &Search->second;
+
+  // The ELF object file we are checking is not created yet.
+  Expected<ELF64LEObjectFile> ElfOrErr =
+      ELF64LEObjectFile::create(Image.getMemoryBuffer());
+  if (!ElfOrErr) {
+    consumeError(ElfOrErr.takeError());
+    return nullptr;
+  }
+
+  auto Result =
+      ELFObjectFiles.try_emplace(Image.getId(), std::move(ElfOrErr.get()));
+  assert(Result.second && "Map insertion failed");
+  assert(Result.first != ELFObjectFiles.end() && "Map insertion failed");
+
+  return &Result.first->second;
+}
+
+Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
+    GenericDeviceTy &Device, DeviceImageTy &Image, const GlobalTy &HostGlobal,
+    bool Device2Host) {
+
+  GlobalTy DeviceGlobal(HostGlobal.getName(), HostGlobal.getSize());
+
+  // Get the metadata from the global on the device.
+  if (auto Err = getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
+    return Err;
+
+  // Perform the actual transfer.
+  return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal,
+                                        Device2Host);
+}
+
+/// Actually move memory between host and device. See readGlobalFromDevice and
+/// writeGlobalToDevice for the interface description.
+Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
+    GenericDeviceTy &Device, DeviceImageTy &DeviceImage,
+    const GlobalTy &HostGlobal, const GlobalTy &DeviceGlobal,
+    bool Device2Host) {
+
+  // Transfer the data from the source to the destination.
+  if (Device2Host) {
+    if (auto Err =
+            Device.dataRetrieve(HostGlobal.getPtr(), DeviceGlobal.getPtr(),
+                                HostGlobal.getSize(), nullptr))
+      return Err;
+  } else {
+    if (auto Err = Device.dataSubmit(DeviceGlobal.getPtr(), HostGlobal.getPtr(),
+                                     HostGlobal.getSize(), nullptr))
+      return Err;
+  }
+
+  DP("Succesfully %s %u bytes associated with global symbol '%s' %s the device "
+     "(%p -> %p).\n",
+     Device2Host ? "read" : "write", HostGlobal.getSize(),
+     HostGlobal.getName().data(), Device2Host ? "from" : "to",
+     DeviceGlobal.getPtr(), HostGlobal.getPtr());
+
+  return Plugin::success();
+}
+
+Error GenericGlobalHandlerTy::getGlobalMetadataFromImage(
+    GenericDeviceTy &Device, DeviceImageTy &Image, GlobalTy &ImageGlobal) {
+
+  // Get the ELF object file for the image. Notice the ELF object may already
+  // be created in previous calls, so we can reuse it.
+  const ELF64LEObjectFile *ELFObj = getOrCreateELFObjectFile(Device, Image);
+  if (!ELFObj)
+    return Plugin::error("Unable to create ELF object for image %p",
+                         Image.getStart());
+
+  // Search the ELF symbol using the the symbol name.
+  auto SymOrErr = getELFSymbol(*ELFObj, ImageGlobal.getName());
+  if (!SymOrErr)
+    return Plugin::error("Failed ELF lookup of global '%s': %s",
+                         ImageGlobal.getName().data(),
+                         toString(SymOrErr.takeError()).data());
+
+  if (!*SymOrErr)
+    return Plugin::error("Failed to find global symbol '%s' in the ELF image",
+                         ImageGlobal.getName().data());
+
+  // Get the section to which the symbol belongs.
+  auto SymSecOrErr = ELFObj->getELFFile().getSection((*SymOrErr)->st_shndx);
+  if (!SymSecOrErr)
+    return Plugin::error("Failed to get ELF section from global '%s': %s",
+                         ImageGlobal.getName().data(),
+                         toString(SymOrErr.takeError()).data());
+
+  // Save the global symbol's address and size. The address of the global is the
+  // image base address + the section offset + the symbol value.
+  ImageGlobal.setPtr((char *)Image.getStart() + (*SymSecOrErr)->sh_offset +
+                     (*SymOrErr)->st_value);
+  ImageGlobal.setSize((*SymOrErr)->st_size);
+
+  return Plugin::success();
+}
+
+Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
+                                                  DeviceImageTy &Image,
+                                                  const GlobalTy &HostGlobal) {
+
+  GlobalTy ImageGlobal(HostGlobal.getName(), -1);
+  if (auto Err = getGlobalMetadataFromImage(Device, Image, ImageGlobal))
+    return Err;
+
+  if (ImageGlobal.getSize() != HostGlobal.getSize())
+    return Plugin::error("Transfer failed because global symbol '%s' has "
+                         "%u bytes in the ELF image but %u bytes on the host",
+                         HostGlobal.getName().data(), ImageGlobal.getSize(),
+                         HostGlobal.getSize());
+
+  DP("Global symbol '%s' was found in the ELF image and %u bytes will copied "
+     "from %p to %p.\n",
+     HostGlobal.getName().data(), HostGlobal.getSize(), ImageGlobal.getPtr(),
+     HostGlobal.getPtr());
+
+  // Perform the copy from the image to the host memory.
+  std::memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize());
+
+  return Plugin::success();
+}
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -0,0 +1,824 @@
+//===- PluginInterface.h - Target independent plugin device interface -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H
+
+#include <cstddef>
+#include <cstdint>
+#include <list>
+#include <map>
+#include <vector>
+
+#include "Debug.h"
+#include "DeviceEnvironment.h"
+#include "GlobalHandler.h"
+#include "MemoryManager.h"
+#include "Utilities.h"
+#include "omptarget.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBufferRef.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+struct GenericPluginTy;
+struct GenericKernelTy;
+struct GenericDeviceTy;
+
+/// Class that wraps the __tgt_async_info to simply its usage. In case the
+/// object is constructed without a valid __tgt_async_info, the object will use
+/// an internal one and will synchronize the current thread with the pending
+/// operations on object destruction.
+struct AsyncInfoWrapperTy {
+  AsyncInfoWrapperTy(Error &Err, GenericDeviceTy &Device,
+                     __tgt_async_info *AsyncInfoPtr)
+      : Err(Err), ErrOutParam(&Err), Device(Device),
+        AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
+
+  /// Synchronize with the __tgt_async_info's pending operations if it's the
+  /// internal one.
+  ~AsyncInfoWrapperTy();
+
+  /// Get the raw __tgt_async_info pointer.
+  operator __tgt_async_info *() const { return AsyncInfoPtr; }
+
+  /// Get a reference to the underlying plugin-specific queue type.
+  template <typename Ty> Ty &getQueueAs() const {
+    static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue),
+                  "Queue is not of the same size as target type");
+    return reinterpret_cast<Ty &>(AsyncInfoPtr->Queue);
+  }
+
+private:
+  Error &Err;
+  ErrorAsOutParameter ErrOutParam;
+  GenericDeviceTy &Device;
+  __tgt_async_info LocalAsyncInfo;
+  __tgt_async_info *const AsyncInfoPtr;
+};
+
+/// Class wrapping a __tgt_device_image and its offload entry table on a
+/// specific device. This class is responsible for storing and managing
+/// the offload entries for an image on a device.
+class DeviceImageTy {
+
+  /// Class representing the offload entry table. The class stores the
+  /// __tgt_target_table and a map to search in the table faster.
+  struct OffloadEntryTableTy {
+    /// Add new entry to the table.
+    void addEntry(const __tgt_offload_entry &Entry) {
+      Entries.push_back(Entry);
+      TTTablePtr.EntriesBegin = &Entries[0];
+      TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size();
+    }
+
+    /// Get the raw pointer to the __tgt_target_table.
+    operator __tgt_target_table *() {
+      if (Entries.empty())
+        return nullptr;
+      return &TTTablePtr;
+    }
+
+  private:
+    __tgt_target_table TTTablePtr;
+    llvm::SmallVector<__tgt_offload_entry> Entries;
+  };
+
+  /// Image identifier within the corresponding device. Notice that this id is
+  /// not unique between different device; they may overlap.
+  int32_t ImageId;
+
+  /// The pointer to the raw __tgt_device_image.
+  const __tgt_device_image *TgtImage;
+
+  /// Table of offload entries.
+  OffloadEntryTableTy OffloadEntryTable;
+
+public:
+  DeviceImageTy(int32_t Id, const __tgt_device_image *Image)
+      : ImageId(Id), TgtImage(Image) {
+    assert(TgtImage && "Invalid target image");
+  }
+
+  /// Get the image identifier within the device.
+  int32_t getId() const { return ImageId; }
+
+  /// Get the pointer to the raw __tgt_device_image.
+  const __tgt_device_image *getTgtImage() const { return TgtImage; }
+
+  /// Get the image starting address.
+  void *getStart() const { return TgtImage->ImageStart; }
+
+  /// Get the image size.
+  size_t getSize() const {
+    return ((char *)TgtImage->ImageEnd) - ((char *)TgtImage->ImageStart);
+  }
+
+  /// Get a memory buffer reference to the whole image.
+  MemoryBufferRef getMemoryBuffer() const {
+    return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
+                           "Image");
+  }
+
+  /// Get a reference to the offload entry table for the image.
+  OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; }
+};
+
+/// Class implementing common functionalities of offload kernels. Each plugin
+/// should define the specific kernel class, derive from this generic one, and
+/// implement the necessary virtual function members.
+struct GenericKernelTy {
+  /// Construct a kernel with a name and a execution mode.
+  GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
+      : Name(Name), ExecutionMode(ExecutionMode), DynamicMemorySize(0),
+        PreferredNumThreads(0), MaxNumThreads(0) {}
+
+  virtual ~GenericKernelTy() {}
+
+  /// Initialize the kernel object from a specific device.
+  Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image);
+  virtual Error initImpl(GenericDeviceTy &GenericDevice,
+                         DeviceImageTy &Image) = 0;
+
+  /// Launch the kernel on the specific device. The device must be the same
+  /// one used to initialize the kernel.
+  Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+               ptrdiff_t *ArgOffsets, int32_t NumArgs, uint64_t NumTeamsClause,
+               uint32_t ThreadLimitClause, uint64_t LoopTripCount,
+               AsyncInfoWrapperTy &AsyncInfoWrapper) const;
+  virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
+                           uint64_t NumBlocks, uint32_t DynamicMemorySize,
+                           int32_t NumKernelArgs, void *KernelArgs,
+                           AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
+
+  /// Get the kernel name.
+  const char *getName() const { return Name; }
+
+  /// Indicate whether an execution mode is valid.
+  static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
+    switch (ExecutionMode) {
+    case OMP_TGT_EXEC_MODE_SPMD:
+    case OMP_TGT_EXEC_MODE_GENERIC:
+    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+      return true;
+    }
+    return false;
+  }
+
+private:
+  /// Prepare the arguments before launching the kernel.
+  void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+                    ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                    llvm::SmallVectorImpl<void *> &Args,
+                    llvm::SmallVectorImpl<void *> &Ptrs,
+                    AsyncInfoWrapperTy &AsyncInfoWrapper) const;
+
+  /// Get the default number of threads and blocks for the kernel.
+  virtual uint32_t getDefaultNumThreads(GenericDeviceTy &Device) const = 0;
+  virtual uint64_t getDefaultNumBlocks(GenericDeviceTy &Device) const = 0;
+
+  /// Get the number of threads and blocks for the kernel based on the
+  /// user-defined threads and block clauses.
+  uint32_t getNumThreads(GenericDeviceTy &GenericDevice,
+                         uint32_t ThreadLimitClause) const;
+  uint64_t getNumBlocks(GenericDeviceTy &GenericDevice,
+                        uint64_t BlockLimitClause, uint64_t LoopTripCount,
+                        uint32_t NumThreads) const;
+
+  /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode.
+  bool isGenericSPMDMode() const {
+    return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD;
+  }
+  bool isGenericMode() const {
+    return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC;
+  }
+  bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; }
+
+  /// Get the execution mode name of the kernel.
+  const char *getExecutionModeName() const {
+    switch (ExecutionMode) {
+    case OMP_TGT_EXEC_MODE_SPMD:
+      return "SPMD";
+    case OMP_TGT_EXEC_MODE_GENERIC:
+      return "Generic";
+    case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
+      return "Generic-SPMD";
+    }
+    llvm_unreachable("Unknown execution mode!");
+  }
+
+  /// The kernel name.
+  const char *Name;
+
+  /// The execution flags of the kernel.
+  OMPTgtExecModeFlags ExecutionMode;
+
+protected:
+  /// The dynamic memory size reserved for executing the kernel.
+  uint32_t DynamicMemorySize;
+
+  /// The preferred number of threads to run the kernel.
+  uint32_t PreferredNumThreads;
+
+  /// The maximum number of threads which the kernel could leverage.
+  uint32_t MaxNumThreads;
+};
+
+/// Class implementing common functionalities of offload devices. Each plugin
+/// should define the specific device class, derive from this generic one, and
+/// implement the necessary virtual function members.
+struct GenericDeviceTy : public DeviceAllocatorTy {
+  /// Construct a device with its device id within the plugin, the number of
+  /// devices in the plugin and the grid values for that kind of device.
+  GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
+                  const llvm::omp::GV &GridValues);
+
+  /// Get the device identifier within the corresponding plugin. Notice that
+  /// this id is not unique between different plugins; they may overlap.
+  int32_t getDeviceId() const { return DeviceId; }
+
+  /// Set the context of the device if needed, before calling device-specific
+  /// functions. Plugins may implement this function as a no-op if not needed.
+  virtual Error setContext() = 0;
+
+  /// Initialize the device. After this call, the device should be already
+  /// working and ready to accept queries or modifications.
+  Error init(GenericPluginTy &Plugin);
+  virtual Error initImpl(GenericPluginTy &Plugin) = 0;
+
+  /// Deinitialize the device and free all its resources. After this call, the
+  /// device is no longer considered ready, so no queries or modifications are
+  /// allowed.
+  Error deinit();
+  virtual Error deinitImpl() = 0;
+
+  /// Load the binary image into the device and return the target table.
+  Expected<__tgt_target_table *> loadBinary(GenericPluginTy &Plugin,
+                                            const __tgt_device_image *TgtImage);
+  virtual Expected<DeviceImageTy *>
+  loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
+
+  /// Setup the device environment if needed. Notice this setup may not be run
+  /// on some plugins. By default, it will be executed, but plugins can change
+  /// this behavior by overriding the shouldSetupDeviceEnvironment function.
+  Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
+
+  /// Register the offload entries for a specific image on the device.
+  Error registerOffloadEntries(DeviceImageTy &Image);
+
+  /// Synchronize the current thread with the pending operations on the
+  /// __tgt_async_info structure.
+  Error synchronize(__tgt_async_info *AsyncInfo);
+  virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
+
+  /// Allocate data on the device or involving the device.
+  Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
+
+  /// Deallocate data from the device or involving the device.
+  Error dataDelete(void *TgtPtr, TargetAllocTy Kind);
+
+  /// Submit data to the device (host to device transfer).
+  Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size,
+                   __tgt_async_info *AsyncInfo);
+  virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+  /// Retrieve data from the device (device to host transfer).
+  Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size,
+                     __tgt_async_info *AsyncInfo);
+  virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+  /// Exchange data between devices (device to device transfer). Calling this
+  /// function is only valid if GenericPlugin::isDataExchangable() passing the
+  /// two devices returns true.
+  Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr,
+                     int64_t Size, __tgt_async_info *AsyncInfo);
+  virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev,
+                                 void *DstPtr, int64_t Size,
+                                 AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+  /// Run the target region with multiple teams.
+  Error runTargetTeamRegion(void *EntryPtr, void **ArgPtrs,
+                            ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                            uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
+                            uint64_t LoopTripCount,
+                            __tgt_async_info *AsyncInfo);
+
+  /// Initialize a __tgt_async_info structure. Related to interop features.
+  Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr);
+  virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+  /// Initialize a __tgt_device_info structure. Related to interop features.
+  Error initDeviceInfo(__tgt_device_info *DeviceInfo);
+  virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0;
+
+  /// Create an event.
+  Error createEvent(void **EventPtrStorage);
+  virtual Error createEventImpl(void **EventPtrStorage) = 0;
+
+  /// Destroy an event.
+  Error destroyEvent(void *Event);
+  virtual Error destroyEventImpl(void *EventPtr) = 0;
+
+  /// Start the recording of the event.
+  Error recordEvent(void *Event, __tgt_async_info *AsyncInfo);
+  virtual Error recordEventImpl(void *EventPtr,
+                                AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+  /// Wait for an event to finish. Notice this wait is asynchronous if the
+  /// __tgt_async_info is not nullptr.
+  Error waitEvent(void *Event, __tgt_async_info *AsyncInfo);
+  virtual Error waitEventImpl(void *EventPtr,
+                              AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;
+
+  /// Synchronize the current thread with the event.
+  Error syncEvent(void *EventPtr);
+  virtual Error syncEventImpl(void *EventPtr) = 0;
+
+  /// Print information about the device.
+  Error printInfo();
+  virtual Error printInfoImpl() = 0;
+
+  /// Getters of the grid values.
+  uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; }
+  uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; }
+  uint64_t getBlockLimit() const { return GridValues.GV_Max_Teams; }
+  uint32_t getDefaultNumThreads() const {
+    return GridValues.GV_Default_WG_Size;
+  }
+  uint64_t getDefaultNumBlocks() const {
+    // TODO: Introduce a default num blocks value.
+    return GridValues.GV_Default_WG_Size;
+  }
+  uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; }
+
+private:
+  /// Register offload entry for global variable.
+  Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
+                                   const __tgt_offload_entry &GlobalEntry,
+                                   __tgt_offload_entry &DeviceEntry);
+
+  /// Register offload entry for kernel function.
+  Error registerKernelOffloadEntry(DeviceImageTy &DeviceImage,
+                                   const __tgt_offload_entry &KernelEntry,
+                                   __tgt_offload_entry &DeviceEntry);
+
+  /// Allocate and construct a kernel object.
+  virtual Expected<GenericKernelTy *>
+  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
+                       DeviceImageTy &Image) = 0;
+
+  /// Get and set the stack size and heap size for the device. If not used, the
+  /// plugin can implement the setters as no-op and setting the output
+  /// value to zero for the getters.
+  virtual Error getDeviceStackSize(uint64_t &V) = 0;
+  virtual Error setDeviceStackSize(uint64_t V) = 0;
+  virtual Error getDeviceHeapSize(uint64_t &V) = 0;
+  virtual Error setDeviceHeapSize(uint64_t V) = 0;
+
+  /// Indicate whether the device should setup the device environment. Notice
+  /// that returning false in this function will change the behavior of the
+  /// setupDeviceEnvironment() function.
+  virtual bool shouldSetupDeviceEnvironment() const { return true; }
+
+  /// Environment variables defined by the OpenMP standard.
+  Int32Envar OMP_TeamLimit;
+  Int32Envar OMP_NumTeams;
+  Int32Envar OMP_TeamsThreadLimit;
+
+  /// Environment variables defined by the LLVM OpenMP implementation.
+  Int32Envar OMPX_DebugKind;
+  UInt32Envar OMPX_SharedMemorySize;
+  UInt64Envar OMPX_TargetStackSize;
+  UInt64Envar OMPX_TargetHeapSize;
+
+  /// Pointer to the memory manager or nullptr if not available.
+  MemoryManagerTy *MemoryManager;
+
+protected:
+  /// Array of images loaded into the device. Images are automatically
+  /// deallocated by the allocator.
+  llvm::SmallVector<DeviceImageTy *> LoadedImages;
+
+  /// The identifier of the device within the plugin. Notice this is not a
+  /// global device id and is not the device id visible to the OpenMP user.
+  const int32_t DeviceId;
+
+  /// The default grid values used for this device.
+  llvm::omp::GV GridValues;
+
+  /// Enumeration used for representing the current state between two devices
+  /// two devices (both under the same plugin) for the peer access between them.
+  /// The states can be a) PENDING when the state has not been queried and needs
+  /// to be queried, b) AVAILABLE when the peer access is available to be used,
+  /// and c) UNAVAILABLE if the system does not allow it.
+  enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING };
+
+  /// Array of peer access states with the rest of devices. This means that if
+  /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE,
+  /// the device I can access device J's memory directly. However, notice this
+  /// does not mean that device J can access device I's memory directly.
+  llvm::SmallVector<PeerAccessState> PeerAccesses;
+  std::mutex PeerAccessesLock;
+};
+
+/// Class implementing common functionalities of offload plugins. Each plugin
+/// should define the specific plugin class, derive from this generic one, and
+/// implement the necessary virtual function members.
+struct GenericPluginTy {
+
+  /// Construct a plugin instance. The number of active instances should be
+  /// always be either zero or one.
+  GenericPluginTy() : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr) {
+    ++NumActiveInstances;
+  }
+
+  /// Destroy the plugin instance and release all its resources. Also decrease
+  /// the number of instances.
+  virtual ~GenericPluginTy() {
+    // There is no global handler if no device is available.
+    if (GlobalHandler)
+      delete GlobalHandler;
+
+    // Deinitialize all active devices.
+    for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
+      if (Devices[DeviceId]) {
+        if (auto Err = deinitDevice(DeviceId))
+          REPORT("Failure to deinitialize device %d: %s\n", DeviceId,
+                 toString(std::move(Err)).data());
+      }
+      assert(!Devices[DeviceId] && "Device was not deinitialized");
+    }
+
+    --NumActiveInstances;
+  }
+
+  /// Get the reference to the device with a certain device id.
+  GenericDeviceTy &getDevice(int32_t DeviceId) {
+    assert(isValidDeviceId(DeviceId) && "Invalid device id");
+    assert(Devices[DeviceId] && "Device is unitialized");
+
+    return *Devices[DeviceId];
+  }
+
+  /// Get the number of active devices.
+  int32_t getNumDevices() const { return NumDevices; }
+
+  /// Get the ELF code to recognize the binary image of this plugin.
+  virtual uint16_t getMagicElfBits() const = 0;
+
+  /// Allocate a structure using the internal allocator.
+  template <typename Ty> Ty *allocate() {
+    return reinterpret_cast<Ty *>(Allocator.Allocate(sizeof(Ty), alignof(Ty)));
+  }
+
+  /// Get the reference to the global handler of this plugin.
+  GenericGlobalHandlerTy &getGlobalHandler() {
+    assert(GlobalHandler && "Global handler not initialized");
+    return *GlobalHandler;
+  }
+
+  /// Get the OpenMP requires flags set for this plugin.
+  int64_t getRequiresFlags() const { return RequiresFlags; }
+
+  /// Set the OpenMP requires flags for this plugin.
+  void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; }
+
+  /// Initialize a device within the plugin.
+  Error initDevice(int32_t DeviceId);
+
+  /// Deinitialize a device within the plugin and release its resources.
+  Error deinitDevice(int32_t DeviceId);
+
+  /// Indicate whether data can be exchanged directly between two devices under
+  /// this same plugin. If this function returns true, it's safe to call the
+  /// GenericDeviceTy::exchangeData() function on the source device.
+  virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) {
+    return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId);
+  }
+
+  /// Indicate if an image is compatible with the plugin devices. Notice that
+  /// this function may be called before actually initializing the devices. So
+  /// we could not move this function into GenericDeviceTy.
+  virtual Expected<bool> isImageCompatible(__tgt_image_info *Info) const = 0;
+
+  /// Indicate whether the plugin supports empty images.
+  virtual bool supportsEmptyImages() const { return false; }
+
+  /// Indicate whether there is any active plugin instance.
+  static bool hasAnyActiveInstance() {
+    assert(NumActiveInstances <= 1 && "Invalid number of instances");
+    return (NumActiveInstances > 0);
+  }
+
+protected:
+  /// Initialize the plugin and prepare for initializing its devices.
+  void init(int NumDevices, GenericGlobalHandlerTy *GlobalHandler) {
+    this->NumDevices = NumDevices;
+    this->GlobalHandler = GlobalHandler;
+
+    assert(Devices.size() == 0 && "Plugin already intialized");
+
+    Devices.resize(NumDevices, nullptr);
+  }
+
+  /// Create a new device with a specific device id.
+  virtual GenericDeviceTy &createDevice(int32_t DeviceId) = 0;
+
+  /// Indicate whether a device id is valid.
+  bool isValidDeviceId(int32_t DeviceId) const {
+    return (DeviceId >= 0 && DeviceId < getNumDevices());
+  }
+
+private:
+  /// Number of devices available for the plugin.
+  int32_t NumDevices;
+
+  /// Array of pointers to the devices. Initially, they are all set to nullptr.
+  /// Once a device is initialized, the pointer is stored in the position given
+  /// by its device id. A position with nullptr means that the corresponding
+  /// device was not initialized yet.
+  llvm::SmallVector<GenericDeviceTy *> Devices;
+
+  /// OpenMP requires flags.
+  int64_t RequiresFlags;
+
+  /// Pointer to the global handler for this plugin.
+  GenericGlobalHandlerTy *GlobalHandler;
+
+  /// Internal allocator for different structures.
+  BumpPtrAllocator Allocator;
+
+  /// Indicates the number of active plugin instances. Actually, we should only
+  /// have one active instance per plugin library. But we use a counter for
+  /// simplicity.
+  static uint32_t NumActiveInstances;
+};
+
+/// Class for simplifying the getter operation of the plugin. Anywhere on the
+/// code, the current plugin can be retrieved by Plugin::get(). The init(),
+/// deinit(), get() and check() functions should be defined by each plugin
+/// implementation.
+class Plugin {
+  /// Avoid instances of this class.
+  Plugin() {}
+  Plugin(const Plugin &) = delete;
+  void operator=(const Plugin &) = delete;
+
+public:
+  /// Initialize the plugin if it was not initialized yet.
+  static Error init();
+
+  /// Deinitialize the plugin if it was not deinitialized yet.
+  static Error deinit();
+
+  /// Get a reference (or create if it was not created) to the plugin instance.
+  static GenericPluginTy &get();
+
+  /// Get a reference to the plugin with a specific plugin-specific type.
+  template <typename Ty> static Ty &get() { return static_cast<Ty &>(get()); }
+
+  /// Indicate if the plugin is currently active. Actually, we check if there is
+  /// any active instances.
+  static bool isActive() { return GenericPluginTy::hasAnyActiveInstance(); }
+
+  /// Create a success error.
+  static Error success() { return Error::success(); }
+
+  /// Create a string error.
+  template <typename... ArgsTy>
+  static Error error(const char *ErrFmt, ArgsTy... Args) {
+    return createStringError(inconvertibleErrorCode(), ErrFmt, Args...);
+  }
+
+  /// Check the plugin-specific error code and return an error or success
+  /// accordingly. In case of an error, create a string error with the error
+  /// description. The ErrFmt should follow the format:
+  ///     "Error in <function name>[<optional info>]: %s"
+  /// The last format specifier "%s" is mandatory and will be used to place the
+  /// error code's description. Notice this function should be only called from
+  /// the plugin-specific code.
+  template <typename... ArgsTy>
+  static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args);
+};
+
+/// Auxiliary interface class for GenericDeviceResourcePoolTy. This class acts
+/// as a reference to a device resource, such as a stream, and requires some
+/// basic functions to be implemented. The derived class should define an empty
+/// constructor that creates an empty and invalid resource reference. Do not
+/// create a new resource on the ctor, but on the create() function instead.
+struct GenericDeviceResourceRef {
+  /// Create a new resource and stores a reference.
+  virtual Error create() = 0;
+
+  /// Destroy and release the resources pointed by the reference.
+  virtual Error destroy() = 0;
+};
+
+/// Class that implements a resource pool belonging to a device. This class
+/// operates with references to the actual resources. These reference must
+/// derive from the GenericDeviceResourceRef class and implement the create
+/// and destroy virtual functions.
+template <typename ResourceRef> class GenericDeviceResourcePoolTy {
+  using ResourcePoolTy = GenericDeviceResourcePoolTy<ResourceRef>;
+
+public:
+  /// Create an empty resource pool for a specific device.
+  GenericDeviceResourcePoolTy(GenericDeviceTy &Device)
+      : Device(Device), NextAvailable(0) {}
+
+  /// Destroy the resource pool. At this point, the deinit() function should
+  /// already have been executed so the resource pool should be empty.
+  virtual ~GenericDeviceResourcePoolTy() {
+    assert(ResourcePool.empty() && "Resource pool not empty");
+  }
+
+  /// Initialize the resource pool.
+  Error init(uint32_t InitialSize) {
+    assert(ResourcePool.empty() && "Resource pool already initialized");
+    return ResourcePoolTy::resizeResourcePool(InitialSize);
+  }
+
+  /// Deinitialize the resource pool and delete all resources. This function
+  /// must be called before the destructor.
+  Error deinit() {
+    if (NextAvailable)
+      DP("Missing %d resources to be returned\n", NextAvailable);
+
+    // TODO: This prevents a bug on libomptarget to make the plugins fail. There
+    // may be some resources not returned. Do not destroy these ones.
+    if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable))
+      return Err;
+
+    ResourcePool.clear();
+
+    return Plugin::success();
+  }
+
+protected:
+  /// Get resource from the pool or create new resources.
+  ResourceRef getResource() {
+    const std::lock_guard<std::mutex> Lock(Mutex);
+    if (NextAvailable == ResourcePool.size()) {
+      // By default we double the resource pool every time.
+      if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable * 2)) {
+        REPORT("Failure to resize the resource pool: %s",
+               toString(std::move(Err)).data());
+        // Return an empty reference.
+        return ResourceRef();
+      }
+    }
+    return ResourcePool[NextAvailable++];
+  }
+
+  /// Return resource to the pool.
+  void returnResource(ResourceRef Resource) {
+    const std::lock_guard<std::mutex> Lock(Mutex);
+    ResourcePool[--NextAvailable] = Resource;
+  }
+
+private:
+  /// The resources between \p OldSize and \p NewSize need to be created or
+  /// destroyed. The mutex is locked when this function is called.
+  Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) {
+    assert(OldSize != NewSize && "Resizing to the same size");
+
+    if (auto Err = Device.setContext())
+      return Err;
+
+    if (OldSize < NewSize) {
+      // Create new resources.
+      for (uint32_t I = OldSize; I < NewSize; ++I) {
+        if (auto Err = ResourcePool[I].create())
+          return Err;
+      }
+    } else {
+      // Destroy the obsolete resources.
+      for (uint32_t I = NewSize; I < OldSize; ++I) {
+        if (auto Err = ResourcePool[I].destroy())
+          return Err;
+      }
+    }
+    return Plugin::success();
+  }
+
+  /// Increase or decrease the number of resources. This function should
+  /// be called with the mutex acquired.
+  Error resizeResourcePool(uint32_t NewSize) {
+    uint32_t OldSize = ResourcePool.size();
+
+    // Nothing to do.
+    if (OldSize == NewSize)
+      return Plugin::success();
+
+    if (OldSize > NewSize) {
+      // Decrease the number of resources.
+      auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
+      ResourcePool.resize(NewSize);
+      return Err;
+    }
+
+    // Increase the number of resources otherwise.
+    ResourcePool.resize(NewSize);
+    return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize);
+  }
+
+  /// The device to which the resources belong
+  GenericDeviceTy &Device;
+
+  /// Mutex for the resource pool.
+  std::mutex Mutex;
+
+  /// The next available resource in the pool.
+  uint32_t NextAvailable;
+
+protected:
+  /// The actual resource pool.
+  std::deque<ResourceRef> ResourcePool;
+};
+
+/// Class implementing a common stream manager. This class can be directly used
+/// by the specific plugins if necessary. The StreamRef type should derive from
+/// the GenericDeviceResourceRef. Look at its description to know the details of
+/// their requirements.
+template <typename StreamRef>
+class GenericStreamManagerTy : public GenericDeviceResourcePoolTy<StreamRef> {
+  using ResourcePoolTy = GenericDeviceResourcePoolTy<StreamRef>;
+
+public:
+  /// Create a stream manager with space for an initial number of streams. No
+  /// stream will be created until the init() function is called.
+  GenericStreamManagerTy(GenericDeviceTy &Device, uint32_t DefNumStreams = 32)
+      : ResourcePoolTy(Device),
+        InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", DefNumStreams) {}
+
+  /// Initialize the stream pool and their resources with the initial number of
+  /// streams.
+  Error init() { return ResourcePoolTy::init(InitialNumStreams.get()); }
+
+  /// Get an available stream or create new.
+  StreamRef getStream() { return ResourcePoolTy::getResource(); }
+
+  /// Return idle stream.
+  void returnStream(StreamRef Stream) {
+    ResourcePoolTy::returnResource(Stream);
+  }
+
+private:
+  /// The initial stream pool size, potentially defined by an envar.
+  UInt32Envar InitialNumStreams;
+};
+
+/// Class implementing a common event manager. This class can be directly used
+/// by the specific plugins if necessary. The EventRef type should derive from
+/// the GenericDeviceResourceRef. Look at its description to know the details of
+/// their requirements.
+template <typename EventRef>
+struct GenericEventManagerTy : public GenericDeviceResourcePoolTy<EventRef> {
+  using ResourcePoolTy = GenericDeviceResourcePoolTy<EventRef>;
+
+public:
+  /// Create an event manager with space for an initial number of events. No
+  /// event will be created until the init() function is called.
+  GenericEventManagerTy(GenericDeviceTy &Device, uint32_t DefNumEvents = 32)
+      : ResourcePoolTy(Device),
+        InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", DefNumEvents) {}
+
+  /// Initialize the event pool and their resources with the initial number of
+  /// events.
+  Error init() { return ResourcePoolTy::init(InitialNumEvents.get()); }
+
+  /// Get an available event or create new.
+  EventRef getEvent() { return ResourcePoolTy::getResource(); }
+
+  /// Return an idle event.
+  void returnEvent(EventRef Event) { ResourcePoolTy::returnResource(Event); }
+
+private:
+  /// The initial event pool size, potentially defined by an envar.
+  UInt32Envar InitialNumEvents;
+};
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
+
+#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H
Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -0,0 +1,849 @@
+//===- PluginInterface.cpp - Target independent plugin device interface ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "PluginInterface.h"
+#include "Debug.h"
+#include "GlobalHandler.h"
+#include "elf_common.h"
+#include "omptarget.h"
+#include "omptargetplugin.h"
+
+#include <cstdint>
+#include <limits>
+
+using namespace llvm;
+using namespace omp;
+using namespace target;
+using namespace plugin;
+
+uint32_t GenericPluginTy::NumActiveInstances = 0;
+
+AsyncInfoWrapperTy::~AsyncInfoWrapperTy() {
+  // If we used a local async info object we want synchronous behavior.
+  // In that case, and assuming the current status code is OK, we will
+  // synchronize explicitly when the object is deleted.
+  if (AsyncInfoPtr == &LocalAsyncInfo && !Err)
+    Err = Device.synchronize(&LocalAsyncInfo);
+}
+
+Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
+                            DeviceImageTy &Image) {
+  PreferredNumThreads = getDefaultNumThreads(GenericDevice);
+  if (isGenericMode())
+    PreferredNumThreads += GenericDevice.getWarpSize();
+
+  MaxNumThreads = GenericDevice.getThreadLimit();
+
+  DynamicMemorySize = GenericDevice.getDynamicMemorySize();
+
+  return initImpl(GenericDevice, Image);
+}
+
+Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
+                              ptrdiff_t *ArgOffsets, int32_t NumArgs,
+                              uint64_t NumTeamsClause,
+                              uint32_t ThreadLimitClause,
+                              uint64_t LoopTripCount,
+                              AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  llvm::SmallVector<void *, 16> Args;
+  llvm::SmallVector<void *, 16> Ptrs;
+
+  void *KernelArgsPtr = prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, NumArgs,
+                                    Args, Ptrs, AsyncInfoWrapper);
+
+  uint32_t NumThreads = getNumThreads(GenericDevice, ThreadLimitClause);
+  uint64_t NumBlocks =
+      getNumBlocks(GenericDevice, NumTeamsClause, LoopTripCount, NumThreads);
+
+  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
+       "Launching kernel %s with %d blocks and %d threads in %s mode\n",
+       getName(), NumBlocks, NumThreads, getExecutionModeName());
+
+  return launchImpl(GenericDevice, NumThreads, NumBlocks, DynamicMemorySize,
+                    NumArgs, KernelArgsPtr, AsyncInfoWrapper);
+}
+
+void *GenericKernelTy::prepareArgs(GenericDeviceTy &GenericDevice,
+                                   void **ArgPtrs, ptrdiff_t *ArgOffsets,
+                                   int32_t NumArgs,
+                                   llvm::SmallVectorImpl<void *> &Args,
+                                   llvm::SmallVectorImpl<void *> &Ptrs,
+                                   AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  Args.resize(NumArgs);
+  Ptrs.resize(NumArgs);
+
+  if (NumArgs == 0)
+    return nullptr;
+
+  for (int I = 0; I < NumArgs; ++I) {
+    Ptrs[I] = (void *)((intptr_t)ArgPtrs[I] + ArgOffsets[I]);
+    Args[I] = &Ptrs[I];
+  }
+  return &Args[0];
+}
+
+uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
+                                        uint32_t ThreadLimitClause) const {
+  return std::min(MaxNumThreads, (ThreadLimitClause > 0) ? ThreadLimitClause
+                                                         : PreferredNumThreads);
+}
+
+uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
+                                       uint64_t NumTeamsClause,
+                                       uint64_t LoopTripCount,
+                                       uint32_t NumThreads) const {
+  uint64_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice);
+  if (NumTeamsClause > 0) {
+    PreferredNumBlocks = NumTeamsClause;
+  } else if (LoopTripCount > 0) {
+    if (isSPMDMode()) {
+      // We have a combined construct, i.e. `target teams distribute
+      // parallel for [simd]`. We launch so many teams so that each thread
+      // will execute one iteration of the loop. round up to the nearest
+      // integer
+      PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
+    } else {
+      assert((isGenericMode() || isGenericSPMDMode()) &&
+             "Unexpected execution mode!");
+      // If we reach this point, then we have a non-combined construct, i.e.
+      // `teams distribute` with a nested `parallel for` and each team is
+      // assigned one iteration of the `distribute` loop. E.g.:
+      //
+      // #pragma omp target teams distribute
+      // for(...loop_tripcount...) {
+      //   #pragma omp parallel for
+      //   for(...) {}
+      // }
+      //
+      // Threads within a team will execute the iterations of the `parallel`
+      // loop.
+      PreferredNumBlocks = LoopTripCount;
+    }
+  }
+  return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
+}
+
+GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
+                                 const llvm::omp::GV &OMPGridValues)
+    : OMP_TeamLimit("OMP_TEAM_LIMIT"), OMP_NumTeams("OMP_NUM_TEAMS"),
+      OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
+      OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
+      OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
+      // Do not initialize the following two envars since they depend on the
+      // device initialization. These cannot be consulted until the device is
+      // initialized correctly. We intialize them in GenericDeviceTy::init().
+      OMPX_TargetStackSize(), OMPX_TargetHeapSize(), MemoryManager(nullptr),
+      DeviceId(DeviceId), GridValues(OMPGridValues),
+      PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock() {
+  if (OMP_NumTeams > 0)
+    GridValues.GV_Max_Teams =
+        std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
+
+  if (OMP_TeamsThreadLimit > 0)
+    GridValues.GV_Max_WG_Size =
+        std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));
+};
+
+Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
+  if (auto Err = initImpl(Plugin))
+    return Err;
+
+  // Read and reinitialize the envars that depend on the device initialization.
+  // Notice these two envars may change the stack size and heap size of the
+  // device, so they need the device properly initialized.
+  auto StackSizeEnvarOrErr = UInt64Envar::create(
+      "LIBOMPTARGET_STACK_SIZE",
+      [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
+      [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
+  if (!StackSizeEnvarOrErr)
+    return StackSizeEnvarOrErr.takeError();
+  OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
+
+  auto HeapSizeEnvarOrErr = UInt64Envar::create(
+      "LIBOMPTARGET_HEAP_SIZE",
+      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
+      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
+  if (!HeapSizeEnvarOrErr)
+    return HeapSizeEnvarOrErr.takeError();
+  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
+
+  // Enable the memory manager if required.
+  auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
+  if (EnableMM)
+    MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
+
+  return Plugin::success();
+}
+
+Error GenericDeviceTy::deinit() {
+  // Delete the memory manager before deinitilizing the device. Otherwise,
+  // we may delete device allocations after the device is deinitialized.
+  if (MemoryManager)
+    delete MemoryManager;
+  MemoryManager = nullptr;
+
+  return deinitImpl();
+}
+
+Expected<__tgt_target_table *>
+GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
+                            const __tgt_device_image *TgtImage) {
+  DP("Load data from image " DPxMOD "\n", DPxPTR(TgtImage->ImageStart));
+
+  // Load the binary and allocate the image object. Use the next available id
+  // for the image id, which is the number of previously loaded images.
+  auto ImageOrErr = loadBinaryImpl(TgtImage, LoadedImages.size());
+  if (!ImageOrErr)
+    return ImageOrErr.takeError();
+
+  DeviceImageTy *Image = *ImageOrErr;
+  assert(Image != nullptr && "Invalid image");
+
+  // Add the image to list.
+  LoadedImages.push_back(Image);
+
+  // Setup the device environment if needed.
+  if (auto Err = setupDeviceEnvironment(Plugin, *Image))
+    return std::move(Err);
+
+  // Register all offload entries of the image.
+  if (auto Err = registerOffloadEntries(*Image))
+    return std::move(Err);
+
+  // Return the pointer to the table of entries.
+  return Image->getOffloadEntryTable();
+}
+
+Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
+                                              DeviceImageTy &Image) {
+  // There are some plugins that do not need this step.
+  if (!shouldSetupDeviceEnvironment())
+    return Plugin::success();
+
+  DeviceEnvironmentTy DeviceEnvironment;
+  DeviceEnvironment.DebugKind = OMPX_DebugKind;
+  DeviceEnvironment.NumDevices = Plugin.getNumDevices();
+  // TODO: The device ID used here is not the real device ID used by OpenMP.
+  DeviceEnvironment.DeviceNum = DeviceId;
+  DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
+
+  // Create the metainfo of the device environment global.
+  GlobalTy DeviceEnvGlobal("omptarget_device_environment",
+                           sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
+
+  // Write device environment values to the device.
+  GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler();
+  return GlobalHandler.writeGlobalToDevice(*this, Image, DeviceEnvGlobal);
+}
+
+Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) {
+  const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin;
+  const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd;
+  for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) {
+    // The host should have always something in the address to uniquely
+    // identify the entry.
+    if (!Entry->addr)
+      return Plugin::error("Failure to register entry without address");
+
+    __tgt_offload_entry DeviceEntry = {0};
+
+    if (Entry->size) {
+      if (auto Err = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry))
+        return Err;
+    } else {
+      if (auto Err = registerKernelOffloadEntry(Image, *Entry, DeviceEntry))
+        return Err;
+    }
+
+    assert(DeviceEntry.addr && "Device addr of offload entry cannot be null");
+
+    DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
+       DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name,
+       DPxPTR(DeviceEntry.addr));
+  }
+  return Plugin::success();
+}
+
+Error GenericDeviceTy::registerGlobalOffloadEntry(
+    DeviceImageTy &Image, const __tgt_offload_entry &GlobalEntry,
+    __tgt_offload_entry &DeviceEntry) {
+
+  GenericPluginTy &Plugin = Plugin::get();
+
+  DeviceEntry = GlobalEntry;
+
+  // Create a metadata object for the device global.
+  GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size);
+
+  // Get the address of the device of the global.
+  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+  if (auto Err =
+          GHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal))
+    return Err;
+
+  // Store the device address on the device entry.
+  DeviceEntry.addr = DeviceGlobal.getPtr();
+  assert(DeviceEntry.addr && "Invalid device global's address");
+
+  // Note: In the current implementation declare target variables
+  // can either be link or to. This means that once unified
+  // memory is activated via the requires directive, the variable
+  // can be used directly from the host in both cases.
+  if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+    // If unified memory is present any target link or to variables
+    // can access host addresses directly. There is no longer a
+    // need for device copies.
+    GlobalTy HostGlobal(GlobalEntry);
+    if (auto Err = GHandler.writeGlobalToDevice(*this, Image, HostGlobal,
+                                                DeviceGlobal))
+      return Err;
+  }
+
+  // Add the device entry on the entry table.
+  Image.getOffloadEntryTable().addEntry(DeviceEntry);
+
+  return Plugin::success();
+}
+
+Error GenericDeviceTy::registerKernelOffloadEntry(
+    DeviceImageTy &Image, const __tgt_offload_entry &KernelEntry,
+    __tgt_offload_entry &DeviceEntry) {
+  DeviceEntry = KernelEntry;
+
+  // Create a kernel object.
+  auto KernelOrErr = constructKernelEntry(KernelEntry, Image);
+  if (!KernelOrErr)
+    return KernelOrErr.takeError();
+
+  GenericKernelTy *Kernel = *KernelOrErr;
+  assert(Kernel != nullptr && "Invalid kernel");
+
+  // Initialize the kernel.
+  if (auto Err = Kernel->init(*this, Image))
+    return Err;
+
+  // Set the device entry address to the kernel address and store the entry on
+  // the entry table.
+  DeviceEntry.addr = (void *)Kernel;
+  Image.getOffloadEntryTable().addEntry(DeviceEntry);
+
+  return Plugin::success();
+}
+
+Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
+  if (!AsyncInfo || !AsyncInfo->Queue)
+    return Plugin::error("Invalid async info queue");
+
+  return synchronizeImpl(*AsyncInfo);
+}
+
+Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
+                                            TargetAllocTy Kind) {
+  void *Alloc = nullptr;
+
+  switch (Kind) {
+  case TARGET_ALLOC_DEFAULT:
+  case TARGET_ALLOC_DEVICE:
+    if (MemoryManager) {
+      Alloc = MemoryManager->allocate(Size, HostPtr);
+      if (!Alloc)
+        return Plugin::error("Failed to allocate from memory manager");
+      break;
+    }
+    [[fallthrough]];
+  case TARGET_ALLOC_HOST:
+  case TARGET_ALLOC_SHARED:
+    Alloc = allocate(Size, HostPtr, Kind);
+    if (!Alloc)
+      return Plugin::error("Failed to allocate from device allocator");
+  }
+
+  // Sucessful and valid allocation.
+  if (Alloc)
+    return Alloc;
+
+  // At this point means that we did not tried to allocate from the memory
+  // manager nor the device allocator.
+  return Plugin::error("Invalid target data allocation kind or requested "
+                       "allocator not implemented yet");
+}
+
+Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
+  int Res;
+  if (MemoryManager)
+    Res = MemoryManager->free(TgtPtr);
+  else
+    Res = free(TgtPtr, Kind);
+
+  if (Res)
+    return Plugin::error("Failure to deallocate device pointer %p", TgtPtr);
+
+  return Plugin::success();
+}
+
+Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
+                                  int64_t Size, __tgt_async_info *AsyncInfo) {
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo);
+  Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
+                                    int64_t Size, __tgt_async_info *AsyncInfo) {
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo);
+  Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
+                                    void *DstPtr, int64_t Size,
+                                    __tgt_async_info *AsyncInfo) {
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo);
+  Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::runTargetTeamRegion(
+    void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs,
+    uint64_t NumTeamsClause, uint32_t ThreadLimitClause, uint64_t LoopTripCount,
+    __tgt_async_info *AsyncInfo) {
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo);
+
+  GenericKernelTy &GenericKernel =
+      *reinterpret_cast<GenericKernelTy *>(EntryPtr);
+
+  Err =
+      GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause,
+                           ThreadLimitClause, LoopTripCount, AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
+  assert(AsyncInfoPtr && "Invalid async info");
+
+  *AsyncInfoPtr = new __tgt_async_info();
+
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, *AsyncInfoPtr);
+  Err = initAsyncInfoImpl(AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
+  assert(DeviceInfo && "Invalid device info");
+
+  return initDeviceInfoImpl(DeviceInfo);
+}
+
+Error GenericPluginTy::initDevice(int32_t DeviceId) {
+  assert(!Devices[DeviceId] && "Device already initialized");
+
+  // Create the device and save the reference.
+  GenericDeviceTy &Device = createDevice(DeviceId);
+  Devices[DeviceId] = &Device;
+
+  // Initialize the device and its resources.
+  return Device.init(*this);
+}
+
+Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
+  // The device may be already deinitialized.
+  if (Devices[DeviceId] == nullptr)
+    return Plugin::success();
+
+  // Deinitialize the device and release its resources.
+  if (auto Err = Devices[DeviceId]->deinit())
+    return Err;
+
+  // Delete the device and invalidate its reference.
+  delete Devices[DeviceId];
+  Devices[DeviceId] = nullptr;
+
+  return Plugin::success();
+}
+
+Error GenericDeviceTy::printInfo() {
+  // TODO: Print generic information here
+  return printInfoImpl();
+}
+
+Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
+  return createEventImpl(EventPtrStorage);
+}
+
+Error GenericDeviceTy::destroyEvent(void *EventPtr) {
+  return destroyEventImpl(EventPtr);
+}
+
+Error GenericDeviceTy::recordEvent(void *EventPtr,
+                                   __tgt_async_info *AsyncInfo) {
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo);
+  Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
+  auto Err = Plugin::success();
+  AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo);
+  Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
+  return Err;
+}
+
+Error GenericDeviceTy::syncEvent(void *EventPtr) {
+  return syncEventImpl(EventPtr);
+}
+
+/// Exposed library API function, basically wrappers around the GenericDeviceTy
+/// functionality with the same name. All non-async functions are redirected
+/// to the async versions right away with a NULL AsyncInfoPtr.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t __tgt_rtl_init_plugin() {
+  auto Err = Plugin::init();
+  if (Err)
+    REPORT("Failure to initialize plugin " GETNAME(TARGET_NAME) ": %s\n",
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_deinit_plugin() {
+  auto Err = Plugin::deinit();
+  if (Err)
+    REPORT("Failure to deinitialize plugin " GETNAME(TARGET_NAME) ": %s\n",
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) {
+  if (!Plugin::isActive())
+    return false;
+
+  return elf_check_machine(TgtImage, Plugin::get().getMagicElfBits());
+}
+
+int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage,
+                                       __tgt_image_info *Info) {
+  if (!Plugin::isActive())
+    return false;
+
+  if (!__tgt_rtl_is_valid_binary(TgtImage))
+    return false;
+
+  // A subarchitecture was not specified. Assume it is compatible.
+  if (!Info->Arch)
+    return true;
+
+  // Check the compatibility with all the available devices. Notice the
+  // devices may not be initialized yet.
+  auto CompatibleOrErr = Plugin::get().isImageCompatible(Info);
+  if (!CompatibleOrErr) {
+    // This error should not abort the execution, so we just inform the user
+    // through the debug system.
+    std::string ErrString = toString(CompatibleOrErr.takeError());
+    DP("Failure to check whether image %p is valid: %s\n", TgtImage,
+       ErrString.data());
+    return false;
+  }
+
+  bool Compatible = *CompatibleOrErr;
+  DP("Image is %scompatible with current environment: %s\n",
+     (Compatible) ? "" : "not", Info->Arch);
+
+  return Compatible;
+}
+
+int32_t __tgt_rtl_supports_empty_images() {
+  return Plugin::get().supportsEmptyImages();
+}
+
+int32_t __tgt_rtl_init_device(int32_t DeviceId) {
+  auto Err = Plugin::get().initDevice(DeviceId);
+  if (Err)
+    REPORT("Failure to initialize device %d: %s\n", DeviceId,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
+  auto Err = Plugin::get().deinitDevice(DeviceId);
+  if (Err)
+    REPORT("Failure to deinitialize device %d: %s\n", DeviceId,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_number_of_devices() { return Plugin::get().getNumDevices(); }
+
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+  Plugin::get().setRequiresFlag(RequiresFlags);
+  return RequiresFlags;
+}
+
+int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
+                                      int32_t DstDeviceId) {
+  return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
+                                          __tgt_device_image *TgtImage) {
+  GenericPluginTy &Plugin = Plugin::get();
+  auto TableOrErr = Plugin.getDevice(DeviceId).loadBinary(Plugin, TgtImage);
+  if (!TableOrErr) {
+    auto Err = TableOrErr.takeError();
+    REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
+           DeviceId, toString(std::move(Err)).data());
+    return nullptr;
+  }
+
+  __tgt_target_table *Table = *TableOrErr;
+  assert(Table != nullptr && "Invalid table");
+
+  return Table;
+}
+
+void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
+                           int32_t Kind) {
+  auto AllocOrErr = Plugin::get().getDevice(DeviceId).dataAlloc(
+      Size, HostPtr, (TargetAllocTy)Kind);
+  if (!AllocOrErr) {
+    auto Err = AllocOrErr.takeError();
+    REPORT("Failure to allocate device memory: %s\n",
+           toString(std::move(Err)).data());
+    return nullptr;
+  }
+  assert(*AllocOrErr && "Null pointer upon successful allocation");
+
+  return *AllocOrErr;
+}
+
+int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
+  auto Err =
+      Plugin::get().getDevice(DeviceId).dataDelete(TgtPtr, (TargetAllocTy)Kind);
+  if (Err)
+    REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
+                              int64_t Size) {
+  return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
+                                     /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
+                                    void *HstPtr, int64_t Size,
+                                    __tgt_async_info *AsyncInfoPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size,
+                                                          AsyncInfoPtr);
+  if (Err)
+    REPORT("Failure to copy data from host to device. Pointers: host "
+           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
+           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
+                                int64_t Size) {
+  return __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
+                                       /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
+                                      void *TgtPtr, int64_t Size,
+                                      __tgt_async_info *AsyncInfoPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr,
+                                                            Size, AsyncInfoPtr);
+  if (Err)
+    REPORT("Faliure to copy data from device to host. Pointers: host "
+           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
+           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr,
+                                int32_t DstDeviceId, void *DstPtr,
+                                int64_t Size) {
+  return __tgt_rtl_data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr,
+                                       Size, /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
+                                      int DstDeviceId, void *DstPtr,
+                                      int64_t Size,
+                                      __tgt_async_info *AsyncInfo) {
+  GenericDeviceTy &SrcDevice = Plugin::get().getDevice(SrcDeviceId);
+  GenericDeviceTy &DstDevice = Plugin::get().getDevice(DstDeviceId);
+  auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
+  if (Err)
+    REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
+           "host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
+           SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
+                                         void **TgtArgs, ptrdiff_t *TgtOffsets,
+                                         int32_t NumArgs, int32_t NumTeams,
+                                         int32_t ThreadLimit,
+                                         uint64_t LoopTripCount) {
+  return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
+                                                TgtOffsets, NumArgs, NumTeams,
+                                                ThreadLimit, LoopTripCount,
+                                                /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
+    int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit,
+    uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).runTargetTeamRegion(
+      TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
+      LoopTripCount, AsyncInfoPtr);
+  if (Err)
+    REPORT("Failure to run target region " DPxMOD " in device %d: %s\n",
+           DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_synchronize(int32_t DeviceId,
+                              __tgt_async_info *AsyncInfoPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).synchronize(AsyncInfoPtr);
+  if (Err)
+    REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
+                                    void **TgtArgs, ptrdiff_t *TgtOffsets,
+                                    int32_t NumArgs) {
+  return __tgt_rtl_run_target_region_async(DeviceId, TgtEntryPtr, TgtArgs,
+                                           TgtOffsets, NumArgs,
+                                           /* AsyncInfoPtr */ nullptr);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
+                                          void **TgtArgs, ptrdiff_t *TgtOffsets,
+                                          int32_t NumArgs,
+                                          __tgt_async_info *AsyncInfoPtr) {
+  return __tgt_rtl_run_target_team_region_async(
+      DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs,
+      /* team num*/ 1, /* thread limit */ 1, /* loop tripcount */ 0,
+      AsyncInfoPtr);
+}
+
+void __tgt_rtl_print_device_info(int32_t DeviceId) {
+  if (auto Err = Plugin::get().getDevice(DeviceId).printInfo())
+    REPORT("Failure to print device %d info: %s\n", DeviceId,
+           toString(std::move(Err)).data());
+}
+
+int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).createEvent(EventPtr);
+  if (Err)
+    REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
+                               __tgt_async_info *AsyncInfoPtr) {
+  auto Err =
+      Plugin::get().getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
+  if (Err)
+    REPORT("Failure to record event %p: %s\n", EventPtr,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
+                             __tgt_async_info *AsyncInfoPtr) {
+  auto Err =
+      Plugin::get().getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
+  if (Err)
+    REPORT("Failure to wait event %p: %s\n", EventPtr,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).syncEvent(EventPtr);
+  if (Err)
+    REPORT("Failure to synchronize event %p: %s\n", EventPtr,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
+  auto Err = Plugin::get().getDevice(DeviceId).destroyEvent(EventPtr);
+  if (Err)
+    REPORT("Failure to destroy event %p: %s\n", EventPtr,
+           toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
+  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
+  InfoLevel.store(NewInfoLevel);
+}
+
+int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
+                                  __tgt_async_info **AsyncInfoPtr) {
+  assert(AsyncInfoPtr && "Invalid async info");
+
+  auto Err = Plugin::get().getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
+  if (Err)
+    REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
+           DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
+                                   __tgt_device_info *DeviceInfo,
+                                   const char **ErrStr) {
+  *ErrStr = "";
+
+  auto Err = Plugin::get().getDevice(DeviceId).initDeviceInfo(DeviceInfo);
+  if (Err)
+    REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
+           DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
+
+  return (bool)Err;
+}
+
+#ifdef __cplusplus
+}
+#endif
Index: openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt
@@ -0,0 +1,98 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a CUDA machine if available.
+#
+##===----------------------------------------------------------------------===##
+set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL
+  "Whether to build CUDA plugin")
+if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN)
+  libomptarget_say("Not building CUDA NextGen offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false")
+  return()
+endif()
+
+if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+  libomptarget_say("Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
+  return()
+endif()
+
+libomptarget_say("Building CUDA NextGen offloading plugin.")
+
+# Define the suffix for the runtime messaging dumps.
+add_definitions("-DTARGET_NAME=CUDA")
+
+# Define debug prefix. TODO: This should be automatized in the Debug.h but it
+# requires changing the original plugins.
+add_definitions(-DDEBUG_PREFIX="TARGET CUDA RTL")
+
+set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
+option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})
+
+set(LIBOMPTARGET_CAN_LINK_LIBCUDA FALSE)
+if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
+  set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE)
+endif()
+
+if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
+  libomptarget_say("Building CUDA NextGen plugin linked against libcuda")
+  include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
+  add_llvm_library(omptarget.rtl.cuda.nextgen SHARED
+
+    src/rtl.cpp
+
+    ADDITIONAL_HEADER_DIRS
+    ${LIBOMPTARGET_INCLUDE_DIR}
+
+    LINK_LIBS
+    PRIVATE
+    elf_common
+    MemoryManager
+    PluginInterface
+    ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
+    ${OPENMP_PTHREAD_LIB}
+    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+    "-Wl,-z,defs"
+
+    NO_INSTALL_RPATH
+  )
+else()
+  libomptarget_say("Building CUDA NextGen plugin for dlopened libcuda")
+  include_directories(../../plugins/cuda/dynamic_cuda)
+  add_llvm_library(omptarget.rtl.cuda.nextgen
+    SHARED
+
+    src/rtl.cpp
+    ../../plugins/cuda/dynamic_cuda/cuda.cpp
+
+    ADDITIONAL_HEADER_DIRS
+    ${LIBOMPTARGET_INCLUDE_DIR}
+
+    LINK_LIBS
+    PRIVATE
+    elf_common
+    MemoryManager
+    PluginInterface
+    ${CMAKE_DL_LIBS}
+    ${OPENMP_PTHREAD_LIB}
+    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
+    "-Wl,-z,defs"
+
+    NO_INSTALL_RPATH
+  )
+endif()
+add_dependencies(omptarget.rtl.cuda.nextgen omptarget.devicertl.nvptx)
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES 
+  INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
+  CXX_VISIBILITY_PRESET protected)
+
+target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE
+  ${LIBOMPTARGET_INCLUDE_DIR}
+)
Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -0,0 +1,1051 @@
+//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL NextGen for CUDA machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <cuda.h>
+#include <string>
+#include <unordered_map>
+
+#include "Debug.h"
+#include "DeviceEnvironment.h"
+#include "GlobalHandler.h"
+#include "PluginInterface.h"
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Forward declarations for all specialized data structures.
+struct CUDAKernelTy;
+struct CUDADeviceTy;
+struct CUDAPluginTy;
+struct CUDAStreamManagerTy;
+struct CUDAEventManagerTy;
+
+/// Class implementing the CUDA kernel functionalities which derives from the
+/// generic kernel class.
+struct CUDAKernelTy : public GenericKernelTy {
+  /// Create a CUDA kernel with a name, an execution mode, and the kernel
+  /// function.
+  CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode,
+               CUfunction Func)
+      : GenericKernelTy(Name, ExecutionMode), Func(Func) {}
+
+  /// Initialize the CUDA kernel
+  Error initImpl(GenericDeviceTy &GenericDevice,
+                 DeviceImageTy &Image) override {
+    int MaxThreads;
+    CUresult Res = cuFuncGetAttribute(
+        &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
+    if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
+      return Err;
+
+    /// Set the maximum number of threads for the CUDA kernel.
+    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
+
+    return Plugin::success();
+  }
+
+  /// Launch the CUDA kernel function
+  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
+                   uint64_t NumBlocks, uint32_t DynamicMemorySize,
+                   int32_t NumKernelArgs, void *KernelArgs,
+                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
+
+  /// The default number of blocks is common to the whole device.
+  uint64_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override {
+    return GenericDevice.getDefaultNumBlocks();
+  }
+
+  /// The default number of threads is common to the whole device.
+  uint32_t getDefaultNumThreads(GenericDeviceTy &GenericDevice) const override {
+    return GenericDevice.getDefaultNumThreads();
+  }
+
+private:
+  /// The CUDA kernel function to execute.
+  CUfunction Func;
+};
+
+/// Class wrapping a CUDA stream reference. These are the objects handled by the
+/// Stream Manager for the CUDA plugin.
+class CUDAStreamRef final : public GenericDeviceResourceRef {
+  /// The reference to the CUDA stream.
+  CUstream Stream;
+
+public:
+  /// Create an empty reference to an invalid stream.
+  CUDAStreamRef() : Stream(nullptr) {}
+
+  /// Create a reference to an existing stream.
+  CUDAStreamRef(CUstream Stream) : Stream(Stream) {}
+
+  /// Create a new stream and save the reference. The reference must be empty
+  /// before calling to this function.
+  Error create() override {
+    if (Stream)
+      return Plugin::error("Creating an existing stream");
+
+    CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
+    if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
+      return Err;
+
+    return Plugin::success();
+  }
+
+  /// Destroy the referenced stream and invalidate the reference. The reference
+  /// must be to a valid stream before calling to this function.
+  Error destroy() override {
+    if (!Stream)
+      return Plugin::error("Destroying an invalid stream");
+
+    CUresult Res = cuStreamDestroy(Stream);
+    if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
+      return Err;
+
+    Stream = nullptr;
+    return Plugin::success();
+  }
+
+  /// Get the underlying CUstream.
+  operator CUstream() const { return Stream; }
+};
+
+/// Class wrapping a CUDA event reference. These are the objects handled by the
+/// Event Manager for the CUDA plugin.
+class CUDAEventRef final : public GenericDeviceResourceRef {
+  CUevent Event;
+
+public:
+  /// Create an empty reference to an invalid event.
+  CUDAEventRef() : Event(nullptr) {}
+
+  /// Create a reference to an existing event.
+  CUDAEventRef(CUevent Event) : Event(Event) {}
+
+  /// Create a new event and save the reference. The reference must be empty
+  /// before calling to this function.
+  Error create() override {
+    if (Event)
+      return Plugin::error("Creating an existing event");
+
+    CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
+    if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
+      return Err;
+
+    return Plugin::success();
+  }
+
+  /// Destroy the referenced event and invalidate the reference. The reference
+  /// must be to a valid event before calling to this function.
+  Error destroy() override {
+    if (!Event)
+      return Plugin::error("Destroying an invalid event");
+
+    CUresult Res = cuEventDestroy(Event);
+    if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
+      return Err;
+
+    Event = nullptr;
+    return Plugin::success();
+  }
+
+  /// Get the underlying CUevent.
+  operator CUevent() const { return Event; }
+};
+
+/// Class implementing the CUDA device images properties.
+struct CUDADeviceImageTy : public DeviceImageTy {
+  /// Create the CUDA image with the id and the target image pointer.
+  CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {}
+
+  /// Load the image as a CUDA module.
+  Error loadModule() {
+    assert(!Module && "Module already loaded");
+
+    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
+      return Err;
+
+    return Plugin::success();
+  }
+
+  /// Unload the CUDA module corresponding to the image.
+  Error unloadModule() {
+    assert(Module && "Module not loaded");
+
+    CUresult Res = cuModuleUnload(Module);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
+      return Err;
+
+    Module = nullptr;
+
+    return Plugin::success();
+  }
+
+  /// Getter of the CUDA module.
+  CUmodule getModule() const { return Module; }
+
+private:
+  /// The CUDA module that loaded the image.
+  CUmodule Module;
+};
+
+/// Class implementing the CUDA device functionalities which derives from the
+/// generic device class.
+struct CUDADeviceTy : public GenericDeviceTy {
+  // Create a CUDA device with a device id and the default CUDA grid values.
+  CUDADeviceTy(int32_t DeviceId, int32_t NumDevices)
+      : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues),
+        CUDAStreamManager(*this), CUDAEventManager(*this) {}
+
+  ~CUDADeviceTy() {}
+
+  /// Initialize the device, its resources and get its properties.
+  Error initImpl(GenericPluginTy &Plugin) override {
+    CUresult Res = cuDeviceGet(&Device, DeviceId);
+    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
+      return Err;
+
+    // Query the current flags of the primary context and set its flags if
+    // it is inactive.
+    unsigned int FormerPrimaryCtxFlags = 0;
+    int FormerPrimaryCtxIsActive = 0;
+    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
+                                     &FormerPrimaryCtxIsActive);
+    if (auto Err =
+            Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
+      return Err;
+
+    if (FormerPrimaryCtxIsActive) {
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "The primary context is active, no change to its flags\n");
+      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
+          CU_CTX_SCHED_BLOCKING_SYNC)
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
+    } else {
+      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+           "The primary context is inactive, set its flags to "
+           "CU_CTX_SCHED_BLOCKING_SYNC\n");
+      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
+      if (auto Err =
+              Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
+        return Err;
+    }
+
+    // Retain the per device primary context and save it to use whenever this
+    // device is selected.
+    Res = cuDevicePrimaryCtxRetain(&Context, Device);
+    if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
+      return Err;
+
+    if (auto Err = setContext())
+      return Err;
+
+    // Initialize stream pool.
+    if (auto Err = CUDAStreamManager.init())
+      return Err;
+
+    // Initialize event pool.
+    if (auto Err = CUDAEventManager.init())
+      return Err;
+
+    // Query attributes to determine number of threads/block and blocks/grid.
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
+                                 GridValues.GV_Max_Teams))
+      return Err;
+
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
+                                 GridValues.GV_Max_WG_Size))
+      return Err;
+
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
+                                 GridValues.GV_Warp_Size))
+      return Err;
+
+    return Plugin::success();
+  }
+
+  /// Deinitialize the device and release its resources.
+  Error deinitImpl() override {
+    if (Context) {
+      if (auto Err = setContext())
+        return Err;
+    }
+
+    // Deinitialize the stream manager.
+    if (auto Err = CUDAStreamManager.deinit())
+      return Err;
+
+    if (auto Err = CUDAEventManager.deinit())
+      return Err;
+
+    // Close modules if necessary.
+    if (!LoadedImages.empty()) {
+      assert(Context && "Invalid CUDA context");
+
+      // Each image has its own module.
+      for (DeviceImageTy *Image : LoadedImages) {
+        CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);
+
+        // Unload the module of the image.
+        if (auto Err = CUDAImage.unloadModule())
+          return Err;
+      }
+    }
+
+    if (Context) {
+      CUresult Res = cuDevicePrimaryCtxRelease(Device);
+      if (auto Err =
+              Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
+        return Err;
+    }
+
+    // Invalidate context and device references.
+    Context = nullptr;
+    Device = CU_DEVICE_INVALID;
+
+    return Plugin::success();
+  }
+
+  /// Allocate and construct a CUDA kernel.
+  Expected<GenericKernelTy *>
+  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
+                       DeviceImageTy &Image) override {
+    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
+
+    // Retrieve the function pointer of the kernel.
+    CUfunction Func;
+    CUresult Res =
+        cuModuleGetFunction(&Func, CUDAImage.getModule(), KernelEntry.name);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
+                                 KernelEntry.name))
+      return std::move(Err);
+
+    DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry),
+       KernelEntry.name, DPxPTR(Func));
+
+    // Create a metadata object for the exec mode global (auto-generated).
+    StaticGlobalTy<llvm::omp::OMPTgtExecModeFlags> ExecModeGlobal(
+        KernelEntry.name, "_exec_mode");
+
+    // Retrieve execution mode for the kernel. This may fail since some kernels
+    // may not have a execution mode.
+    GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+    if (auto Err = GHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) {
+      // In some cases the execution mode is not included, so use the default.
+      ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_GENERIC);
+      // Consume the error since it is acceptable to fail.
+      [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
+
+      DP("Failed to read execution mode for '%s': %s\n"
+         "Using default GENERIC (1) execution mode\n",
+         KernelEntry.name, ErrStr.data());
+    }
+
+    // Check that the retrieved execution mode is valid.
+    if (!GenericKernelTy::isValidExecutionMode(ExecModeGlobal.getValue()))
+      return Plugin::error("Invalid execution mode %d for '%s'",
+                           ExecModeGlobal.getValue(), KernelEntry.name);
+
+    // Allocate and initialize the CUDA kernel.
+    CUDAKernelTy *CUDAKernel = Plugin::get().allocate<CUDAKernelTy>();
+    new (CUDAKernel)
+        CUDAKernelTy(KernelEntry.name, ExecModeGlobal.getValue(), Func);
+
+    return CUDAKernel;
+  }
+
+  /// Set the current context to this device's context.
+  Error setContext() override {
+    CUresult Res = cuCtxSetCurrent(Context);
+    return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
+  }
+
+  /// Get the stream of the asynchronous info sructure or get a new one.
+  CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
+    CUstream &Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
+    if (!Stream)
+      Stream = CUDAStreamManager.getStream();
+    return Stream;
+  }
+
+  /// Getters of CUDA references.
+  CUcontext getCUDAContext() const { return Context; }
+  CUdevice getCUDADevice() const { return Device; }
+
+  /// Load the binary image into the device and allocate an image object.
+  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+                                           int32_t ImageId) override {
+    if (auto Err = setContext())
+      return std::move(Err);
+
+    // Allocate and initialize the image object.
+    CUDADeviceImageTy *CUDAImage = Plugin::get().allocate<CUDADeviceImageTy>();
+    new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage);
+
+    // Load the CUDA module.
+    if (auto Err = CUDAImage->loadModule())
+      return std::move(Err);
+
+    return CUDAImage;
+  }
+
+  /// Allocate memory on the device or related to the device.
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+    if (Size == 0)
+      return nullptr;
+
+    if (auto Err = setContext()) {
+      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
+      return nullptr;
+    }
+
+    void *MemAlloc = nullptr;
+    CUdeviceptr DevicePtr;
+    CUresult Res;
+
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+      Res = cuMemAlloc(&DevicePtr, Size);
+      MemAlloc = (void *)DevicePtr;
+      break;
+    case TARGET_ALLOC_HOST:
+      Res = cuMemAllocHost(&MemAlloc, Size);
+      break;
+    case TARGET_ALLOC_SHARED:
+      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
+      MemAlloc = (void *)DevicePtr;
+      break;
+    }
+
+    if (auto Err =
+            Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
+      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
+      return nullptr;
+    }
+    return MemAlloc;
+  }
+
+  /// Deallocate memory on the device or related to the device.
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
+    if (TgtPtr == nullptr)
+      return OFFLOAD_SUCCESS;
+
+    if (auto Err = setContext()) {
+      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+      return OFFLOAD_FAIL;
+    }
+
+    CUresult Res;
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_SHARED:
+      Res = cuMemFree((CUdeviceptr)TgtPtr);
+      break;
+    case TARGET_ALLOC_HOST:
+      Res = cuMemFreeHost(TgtPtr);
+      break;
+    }
+
+    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
+      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
+      return OFFLOAD_FAIL;
+    }
+    return OFFLOAD_SUCCESS;
+  }
+
+  /// Synchronize current thread with the pending operations on the async info.
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
+    CUresult Res = cuStreamSynchronize(Stream);
+
+    // Once the stream is synchronized, return it to stream pool and reset
+    // AsyncInfo. This is to make sure the synchronization only works for its
+    // own tasks.
+    CUDAStreamManager.returnStream(Stream);
+    AsyncInfo.Queue = nullptr;
+
+    return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
+  }
+
+  /// Submit data to the device (host to device transfer).
+  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream = getStream(AsyncInfoWrapper);
+    if (!Stream)
+      return Plugin::error("Failure to get stream");
+
+    CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
+    return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
+  }
+
+  /// Retrieve data from the device (device to host transfer).
+  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    CUstream Stream = getStream(AsyncInfoWrapper);
+    if (!Stream)
+      return Plugin::error("Failure to get stream");
+
+    CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
+    return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
+  }
+
+  /// Exchange data between two devices directly. We may use peer access if
+  /// the CUDA devices and driver allow them.
+  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
+                         void *DstPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;
+
+  /// Initialize the async info for interoperability purposes.
+  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    if (auto Err = setContext())
+      return Err;
+
+    if (!getStream(AsyncInfoWrapper))
+      return Plugin::error("Failure to get stream");
+
+    return Plugin::success();
+  }
+
+  /// Initialize the device info for interoperability purposes.
+  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
+    assert(Context && "Context is null");
+    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");
+
+    if (auto Err = setContext())
+      return Err;
+
+    if (!DeviceInfo->Context)
+      DeviceInfo->Context = Context;
+
+    if (!DeviceInfo->Device)
+      DeviceInfo->Device = reinterpret_cast<void *>(Device);
+
+    return Plugin::success();
+  }
+
+  /// Create an event.
+  Error createEventImpl(void **EventPtrStorage) override {
+    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
+    *Event = CUDAEventManager.getEvent();
+    return Plugin::success();
+  }
+
+  /// Destroy a previously created event.
+  Error destroyEventImpl(void *EventPtr) override {
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+    CUDAEventManager.returnEvent(Event);
+    return Plugin::success();
+  }
+
+  /// Record the event.
+  Error recordEventImpl(void *EventPtr,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+    CUstream Stream = getStream(AsyncInfoWrapper);
+    if (!Stream)
+      return Plugin::error("Failure to get stream");
+
+    CUresult Res = cuEventRecord(Event, Stream);
+    return Plugin::check(Res, "Error in cuEventRecord: %s");
+  }
+
+  /// Make the stream wait on the event.
+  Error waitEventImpl(void *EventPtr,
+                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+
+    CUstream Stream = getStream(AsyncInfoWrapper);
+    if (!Stream)
+      return Plugin::error("Failure to get stream");
+
+    // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from
+    // specific CUDA version, and defined as 0x0. In previous version, per CUDA
+    // API document, that argument has to be 0x0.
+    CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
+    return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
+  }
+
+  /// Synchronize the current thread with the event.
+  Error syncEventImpl(void *EventPtr) override {
+    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
+    CUresult Res = cuEventSynchronize(Event);
+    return Plugin::check(Res, "Error in cuEventSynchronize: %s");
+  }
+
+  /// Print information about the device.
+  Error printInfoImpl() override {
+    char TmpChar[1000];
+    std::string TmpStr;
+    size_t TmpSt;
+    int TmpInt, TmpInt2, TmpInt3;
+
+    // TODO: All these calls should be checked, but the whole printInfo must be
+    // improved, so we will refactor it in the future.
+    cuDriverGetVersion(&TmpInt);
+    printf("    CUDA Driver Version: \t\t%d \n", TmpInt);
+    printf("    CUDA Device Number: \t\t%d \n", DeviceId);
+
+    cuDeviceGetName(TmpChar, 1000, Device);
+    printf("    Device Name: \t\t\t%s \n", TmpChar);
+
+    cuDeviceTotalMem(&TmpSt, Device);
+    printf("    Global Memory Size: \t\t%zu bytes \n", TmpSt);
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                         Device);
+    printf("    Number of Multiprocessors: \t\t%d \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device);
+    printf("    Concurrent Copy and Execution: \t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
+                         Device);
+    printf("    Total Constant Memory: \t\t%d bytes\n", TmpInt);
+
+    cuDeviceGetAttribute(
+        &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device);
+    printf("    Max Shared Memory per Block: \t%d bytes \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
+                         Device),
+        printf("    Registers per Block: \t\t%d \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
+    printf("    Warp Size: \t\t\t\t%d Threads \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                         Device);
+    printf("    Maximum Threads per Block: \t\t%d \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
+    cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device);
+    cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device);
+    printf("    Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
+           TmpInt3);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device);
+    cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device);
+    cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device);
+    printf("    Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
+           TmpInt3);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device);
+    printf("    Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
+                         Device);
+    printf("    Texture Alignment: \t\t\t%d bytes \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device);
+    printf("    Clock Rate: \t\t\t%d kHz\n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,
+                         Device);
+    printf("    Execution Timeout: \t\t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device);
+    printf("    Integrated Device: \t\t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY,
+                         Device);
+    printf("    Can Map Host Memory: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device);
+    if (TmpInt == CU_COMPUTEMODE_DEFAULT)
+      TmpStr = "DEFAULT";
+    else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
+      TmpStr = "PROHIBITED";
+    else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
+      TmpStr = "EXCLUSIVE PROCESS";
+    else
+      TmpStr = "unknown";
+    printf("    Compute Mode: \t\t\t%s \n", TmpStr.c_str());
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS,
+                         Device);
+    printf("    Concurrent Kernels: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device);
+    printf("    ECC Enabled: \t\t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+                         Device);
+    printf("    Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
+                         Device);
+    printf("    Memory Bus Width: \t\t\t%d bits\n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device);
+    printf("    L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
+
+    cuDeviceGetAttribute(
+        &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device);
+    printf("    Max Threads Per SMP: \t\t%d \n", TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT,
+                         Device);
+    printf("    Async Engines: \t\t\t%s (%d) \n", TmpInt ? "Yes" : "No",
+           TmpInt);
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING,
+                         Device);
+    printf("    Unified Addressing: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device);
+    printf("    Managed Memory: \t\t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
+                         Device);
+    printf("    Concurrent Managed Memory: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(
+        &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device);
+    printf("    Preemption Supported: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH,
+                         Device);
+    printf("    Cooperative Launch: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device);
+    printf("    Multi-Device Boars: \t\t%s \n", TmpInt ? "Yes" : "No");
+
+    cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                         Device);
+    cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                         Device);
+    printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
+
+    return Plugin::success();
+  }
+
+  /// Getters and setters for stack and heap sizes.
+  Error getDeviceStackSize(uint64_t &Value) override {
+    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
+  }
+  Error setDeviceStackSize(uint64_t Value) override {
+    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
+  }
+  Error getDeviceHeapSize(uint64_t &Value) override {
+    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
+  }
+  Error setDeviceHeapSize(uint64_t Value) override {
+    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
+  }
+
+  /// CUDA-specific functions for getting and setting context limits.
+  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
+    CUresult Res = cuCtxSetLimit(Kind, Value);
+    return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
+  }
+  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
+    CUresult Res = cuCtxGetLimit(&Value, Kind);
+    return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
+  }
+
+  /// CUDA-specific function to get device attributes.
+  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
+    // TODO: Warn if the new value is larger than the old.
+    CUresult Res =
+        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
+    return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
+  }
+
+private:
+  using CUDAStreamManagerTy = GenericStreamManagerTy<CUDAStreamRef>;
+  using CUDAEventManagerTy = GenericEventManagerTy<CUDAEventRef>;
+
+  /// Stream manager for CUDA streams.
+  CUDAStreamManagerTy CUDAStreamManager;
+
+  /// Event manager for CUDA events.
+  CUDAEventManagerTy CUDAEventManager;
+
+  /// The device's context. This context should be set before performing
+  /// operations on the device.
+  CUcontext Context = nullptr;
+
+  /// The CUDA device handler.
+  CUdevice Device = CU_DEVICE_INVALID;
+};
+
+Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                               uint32_t NumThreads, uint64_t NumBlocks,
+                               uint32_t DynamicMemorySize,
+                               int32_t NumKernelArgs, void *KernelArgs,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+
+  CUstream Stream = CUDADevice.getStream(AsyncInfoWrapper);
+  if (!Stream)
+    return Plugin::error("Failure to get stream");
+
+  CUresult Res =
+      cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1,
+                     /* gridDimZ */ 1, NumThreads,
+                     /* blockDimY */ 1, /* blockDimZ */ 1, DynamicMemorySize,
+                     Stream, (void **)KernelArgs, nullptr);
+  return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
+}
+
+/// Class implementing the CUDA-specific functionalities of the global handler.
+class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
+public:
+  /// Get the metadata of a global from the device. The name and size of the
+  /// global is read from DeviceGlobal and the address of the global is written
+  /// to DeviceGlobal.
+  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
+                                    DeviceImageTy &Image,
+                                    GlobalTy &DeviceGlobal) override {
+    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);
+
+    const char *GlobalName = DeviceGlobal.getName().data();
+
+    size_t CUSize;
+    CUdeviceptr CUPtr;
+    CUresult Res =
+        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
+    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
+                                 GlobalName))
+      return Err;
+
+    if (CUSize != DeviceGlobal.getSize())
+      return Plugin::error(
+          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
+          GlobalName, CUSize, (size_t)DeviceGlobal.getSize());
+
+    DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
+    return Plugin::success();
+  }
+};
+
+/// Class implementing the CUDA-specific functionalities of the plugin.
+struct CUDAPluginTy final : public GenericPluginTy {
+  /// Create a CUDA plugin and initialize the CUDA driver.
+  CUDAPluginTy() : GenericPluginTy() {
+    CUresult Res = cuInit(0);
+    if (Res == CUDA_ERROR_INVALID_HANDLE) {
+      // Cannot call cuGetErrorString if dlsym failed.
+      DP("Failed to load CUDA shared library\n");
+      return;
+    }
+
+    if (Res == CUDA_ERROR_NO_DEVICE) {
+      // Do not initialize if there are no devices.
+      DP("There are no devices supporting CUDA.\n");
+      return;
+    }
+
+    if (auto Err = Plugin::check(Res, "Error in cuInit: %s")) {
+      REPORT("%s\n", toString(std::move(Err)).data());
+      return;
+    }
+
+    // Get the number of devices.
+    int NumDevices;
+    Res = cuDeviceGetCount(&NumDevices);
+    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s")) {
+      REPORT("%s\n", toString(std::move(Err)).data());
+      return;
+    }
+
+    // Do not initialize if there are no devices.
+    if (NumDevices == 0) {
+      DP("There are no devices supporting CUDA.\n");
+      return;
+    }
+
+    // Initialize the generic plugin structure.
+    GenericPluginTy::init(NumDevices, new CUDAGlobalHandlerTy());
+  }
+
+  /// This class should not be copied.
+  CUDAPluginTy(const CUDAPluginTy &) = delete;
+  CUDAPluginTy(CUDAPluginTy &&) = delete;
+
+  ~CUDAPluginTy() {}
+
+  /// Get the ELF code for recognizing the compatible image binary.
+  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }
+
+  /// Create a CUDA device with a specific id.
+  CUDADeviceTy &createDevice(int32_t DeviceId) override {
+    CUDADeviceTy *Device = new CUDADeviceTy(DeviceId, getNumDevices());
+    return *Device;
+  }
+
+  /// Check whether the image is compatible with the available CUDA devices.
+  Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
+    for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) {
+      CUdevice Device;
+      CUresult Res = cuDeviceGet(&Device, DevId);
+      if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
+        return std::move(Err);
+
+      int32_t Major, Minor;
+      Res = cuDeviceGetAttribute(
+          &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
+      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
+        return std::move(Err);
+
+      Res = cuDeviceGetAttribute(
+          &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
+      if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
+        return std::move(Err);
+
+      StringRef ArchStr(Info->Arch);
+      StringRef PrefixStr("sm_");
+      if (!ArchStr.startswith(PrefixStr))
+        return Plugin::error("Unrecognized image arch %s", ArchStr.data());
+
+      int32_t ImageMajor = ArchStr[PrefixStr.size() + 0] - '0';
+      int32_t ImageMinor = ArchStr[PrefixStr.size() + 1] - '0';
+
+      // A cubin generated for a certain compute capability is supported to run
+      // on any GPU with the same major revision and same or higher minor
+      // revision.
+      if (Major != ImageMajor || Minor < ImageMinor)
+        return false;
+    }
+    return true;
+  }
+};
+
+Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
+                                     GenericDeviceTy &DstGenericDevice,
+                                     void *DstPtr, int64_t Size,
+                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  if (auto Err = setContext())
+    return Err;
+
+  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
+
+  CUresult Res;
+  int32_t DstDeviceId = DstDevice.DeviceId;
+  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
+  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
+
+  int CanAccessPeer = 0;
+  if (DeviceId != DstDeviceId) {
+    // Make sure the lock is released before performing the copies.
+    std::lock_guard<std::mutex> Lock(PeerAccessesLock);
+
+    switch (PeerAccesses[DstDeviceId]) {
+    case PeerAccessState::AVAILABLE:
+      CanAccessPeer = 1;
+      break;
+    case PeerAccessState::UNAVAILABLE:
+      CanAccessPeer = 0;
+      break;
+    case PeerAccessState::PENDING:
+      // Check whether the source device can access the destination device.
+      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
+      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
+        return Err;
+
+      if (CanAccessPeer) {
+        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
+        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
+          // Resources may be exhausted due to many P2P links.
+          CanAccessPeer = 0;
+          DP("Too many P2P so fall back to D2D memcpy");
+        } else if (auto Err =
+                       Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s"))
+          return Err;
+      }
+      PeerAccesses[DstDeviceId] = (CanAccessPeer)
+                                      ? PeerAccessState::AVAILABLE
+                                      : PeerAccessState::UNAVAILABLE;
+    }
+  }
+
+  CUstream Stream = getStream(AsyncInfoWrapper);
+  if (!Stream)
+    return Plugin::error("Failure to get stream");
+
+  if (CanAccessPeer) {
+    // TODO: Should we fallback to D2D if peer access fails?
+    Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context,
+                            Size, Stream);
+    return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
+  }
+
+  // Fallback to D2D copy.
+  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
+  return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
+}
+
+Error Plugin::init() {
+  // Call the getter to intialize the CUDA plugin.
+  get();
+  return Plugin::success();
+}
+
+Error Plugin::deinit() {
+  // The CUDA plugin and the CUDA driver should already be deinitialized
+  // at this point. So do nothing for this plugin.
+  if (Plugin::isActive())
+    return Plugin::error("CUDA plugin is not deinitialized");
+
+  return Plugin::success();
+}
+
+GenericPluginTy &Plugin::get() {
+  // The CUDA plugin instance is built the first time that Plugin::get() is
+  // called thanks to the following static variable. The ideal implementation
+  // would initialize the plugin in Plugin::init() (__tgt_rtl_plugin_init) and
+  // destroy it in Plugin::deinit() (__tgt_rtl_plugin_deinit). However, at the
+  // time Plugin::deinit() is called, the CUDA driver is already shut down. That
+  // is caused by the fact that __tgt_rtl_plugin_deinit is called from a dtor
+  // in libomptarget. Thus, this is a workaround until that aspect is fixed.
+  static CUDAPluginTy CUDAPlugin;
+  assert(Plugin::isActive() && "Plugin is not active");
+  return CUDAPlugin;
+}
+
+template <typename... ArgsTy>
+Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
+  CUresult ResultCode = static_cast<CUresult>(Code);
+  if (ResultCode == CUDA_SUCCESS)
+    return Error::success();
+
+  const char *Desc = "Unknown error";
+  CUresult Ret = cuGetErrorString(ResultCode, &Desc);
+  if (Ret != CUDA_SUCCESS)
+    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);
+
+  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
+                                                    ErrFmt, Args..., Desc);
+}
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
Index: openmp/libomptarget/plugins-nextgen/exports
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/exports
@@ -0,0 +1,6 @@
+VERS1.0 {
+  global:
+    __tgt_rtl*;
+  local:
+    *;
+};
Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -0,0 +1,401 @@
+//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// RTL NextGen for generic 64-bit machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstddef>
+#include <ffi.h>
+#include <string>
+#include <unordered_map>
+
+#include "Debug.h"
+#include "DeviceEnvironment.h"
+#include "GlobalHandler.h"
+#include "PluginInterface.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+// The number of devices in this plugin.
+#define NUM_DEVICES 4
+
+// The ELF ID should be defined at compile-time by the build system.
+#ifndef TARGET_ELF_ID
+#define TARGET_ELF_ID 0
+#endif
+
+namespace llvm {
+namespace omp {
+namespace target {
+namespace plugin {
+
+/// Forward declarations for all specialized data structures.
+struct GenELF64KernelTy;
+struct GenELF64DeviceTy;
+struct GenELF64PluginTy;
+
+using llvm::sys::DynamicLibrary;
+
+/// Class implementing kernel functionalities for GenELF64.
+struct GenELF64KernelTy : public GenericKernelTy {
+  /// Construct the kernel with a name, execution mode and a function.
+  GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode,
+                   void (*Func)(void))
+      : GenericKernelTy(Name, ExecutionMode), Func(Func) {}
+
+  /// Initialize the kernel.
+  Error initImpl(GenericDeviceTy &GenericDevice,
+                 DeviceImageTy &Image) override {
+    // Set the maximum number of threads to a single.
+    MaxNumThreads = 1;
+    return Plugin::success();
+  }
+
+  /// Launch the kernel using the libffi.
+  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
+                   uint64_t NumBlocks, uint32_t DynamicMemorySize,
+                   int32_t NumKernelArgs, void *KernelArgs,
+                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
+    // Create a vector of ffi_types, one per argument.
+    SmallVector<ffi_type *, 16> ArgTypes(NumKernelArgs, &ffi_type_pointer);
+    ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr;
+
+    // Prepare the cif structure before running the kernel function.
+    ffi_cif Cif;
+    ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, NumKernelArgs,
+                                     &ffi_type_void, ArgTypesPtr);
+    if (Status != FFI_OK)
+      return Plugin::error("Error in ffi_prep_cif: %d", Status);
+
+    // Call the kernel function through libffi.
+    long Return;
+    ffi_call(&Cif, Func, &Return, (void **)KernelArgs);
+
+    return Plugin::success();
+  }
+
+  /// Get the default number of blocks and threads for the kernel.
+  uint64_t getDefaultNumBlocks(GenericDeviceTy &) const override { return 1; }
+  uint32_t getDefaultNumThreads(GenericDeviceTy &) const override { return 1; }
+
+private:
+  /// The kernel function to execute.
+  void (*Func)(void);
+};
+
+/// Class implementing the GenELF64 device images properties.
+struct GenELF64DeviceImageTy : public DeviceImageTy {
+  /// Create the GenELF64 image with the id and the target image pointer.
+  GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, TgtImage), DynLib() {}
+
+  /// Getter and setter for the dynamic library.
+  DynamicLibrary &getDynamicLibrary() { return DynLib; }
+  void setDynamicLibrary(const DynamicLibrary &Lib) { DynLib = Lib; }
+
+private:
+  /// The dynamic library that loaded the image.
+  DynamicLibrary DynLib;
+};
+
+/// Class implementing the device functionalities for GenELF64.
+struct GenELF64DeviceTy : public GenericDeviceTy {
+  /// Create the device with a specific id.
+  GenELF64DeviceTy(int32_t DeviceId, int32_t NumDevices)
+      : GenericDeviceTy(DeviceId, NumDevices, GenELF64GridValues) {}
+
+  ~GenELF64DeviceTy() {}
+
+  /// Initialize the device, which is a no-op
+  Error initImpl(GenericPluginTy &Plugin) override { return Plugin::success(); }
+
+  /// Deinitialize the device, which is a no-op
+  Error deinitImpl() override { return Plugin::success(); }
+
+  /// Construct the kernel for a specific image on the device.
+  Expected<GenericKernelTy *>
+  constructKernelEntry(const __tgt_offload_entry &KernelEntry,
+                       DeviceImageTy &Image) override {
+    GlobalTy Func(KernelEntry);
+
+    // Get the metadata (address) of the kernel function.
+    GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
+    if (auto Err = GHandler.getGlobalMetadataFromDevice(*this, Image, Func))
+      return std::move(Err);
+
+    // Allocate and create the kernel.
+    GenELF64KernelTy *GenELF64Kernel =
+        Plugin::get().allocate<GenELF64KernelTy>();
+    new (GenELF64Kernel) GenELF64KernelTy(
+        KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)())Func.getPtr());
+
+    return GenELF64Kernel;
+  }
+
+  /// Set the current context to this device, which is a no-op.
+  Error setContext() override { return Plugin::success(); }
+
+  /// Load the binary image into the device and allocate an image object.
+  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
+                                           int32_t ImageId) override {
+    // Allocate and initialize the image object.
+    GenELF64DeviceImageTy *Image =
+        Plugin::get().allocate<GenELF64DeviceImageTy>();
+    new (Image) GenELF64DeviceImageTy(ImageId, TgtImage);
+
+    // Create a temporary file.
+    char TmpFileName[] = "/tmp/tmpfile_XXXXXX";
+    int TmpFileFd = mkstemp(TmpFileName);
+    if (TmpFileFd == -1)
+      return Plugin::error("Failed to create tmpfile for loading target image");
+
+    // Open the temporary file.
+    FILE *TmpFile = fdopen(TmpFileFd, "wb");
+    if (!TmpFile)
+      return Plugin::error("Failed to open tmpfile %s for loading target image",
+                           TmpFileName);
+
+    // Write the image into the temporary file.
+    size_t Written = fwrite(Image->getStart(), Image->getSize(), 1, TmpFile);
+    if (Written != 1)
+      return Plugin::error("Failed to write target image to tmpfile %s",
+                           TmpFileName);
+
+    // Close the temporary file.
+    int Ret = fclose(TmpFile);
+    if (Ret)
+      return Plugin::error("Failed to close tmpfile %s with the target image",
+                           TmpFileName);
+
+    // Load the temporary file as a dynamic library.
+    std::string ErrMsg;
+    DynamicLibrary DynLib =
+        DynamicLibrary::getPermanentLibrary(TmpFileName, &ErrMsg);
+
+    // Check if the loaded library is valid.
+    if (!DynLib.isValid())
+      return Plugin::error("Failed to load target image: %s", ErrMsg.c_str());
+
+    // Save a reference of the image's dynamic library.
+    Image->setDynamicLibrary(DynLib);
+
+    return Image;
+  }
+
+  /// Allocate memory. Use std::malloc in all cases.
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+    if (Size == 0)
+      return nullptr;
+
+    void *MemAlloc = nullptr;
+    switch (Kind) {
+    case TARGET_ALLOC_DEFAULT:
+    case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_HOST:
+    case TARGET_ALLOC_SHARED:
+      MemAlloc = std::malloc(Size);
+      break;
+    }
+    return MemAlloc;
+  }
+
+  /// Free the memory. Use std::free in all cases.
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
+    std::free(TgtPtr);
+    return OFFLOAD_SUCCESS;
+  }
+
+  /// Submit data to the device (host to device transfer).
+  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
+                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    std::memcpy(TgtPtr, HstPtr, Size);
+    return Plugin::success();
+  }
+
+  /// Retrieve data from the device (device to host transfer).
+  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    std::memcpy(HstPtr, TgtPtr, Size);
+    return Plugin::success();
+  }
+
+  /// Exchange data between two devices within the plugin. This function is not
+  /// supported in this plugin.
+  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
+                         void *DstPtr, int64_t Size,
+                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    // This function should never be called because the function
+    // GenELF64PluginTy::isDataExchangable() returns false.
+    return Plugin::error("dataExchangeImpl not supported");
+  }
+
+  /// All functions are already synchronous. No need to do anything on this
+  /// synchronization function.
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+    return Plugin::success();
+  }
+
+  /// This plugin does not support interoperability
+  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::error("initAsyncInfoImpl not supported");
+  }
+
+  /// This plugin does not support interoperability
+  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
+    return Plugin::error("initDeviceInfoImpl not supported");
+  }
+
+  /// This plugin does not support the event API. Do nothing without failing.
+  Error createEventImpl(void **EventPtrStorage) override {
+    *EventPtrStorage = nullptr;
+    return Plugin::success();
+  }
+  Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); }
+  Error recordEventImpl(void *EventPtr,
+                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::success();
+  }
+  Error waitEventImpl(void *EventPtr,
+                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
+    return Plugin::success();
+  }
+  Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
+
+  /// Print information about the device.
+  Error printInfoImpl() override {
+    printf("    This is a generic-elf-64bit device\n");
+    return Plugin::success();
+  }
+
+  /// This plugin should not setup the device environment.
+  virtual bool shouldSetupDeviceEnvironment() const override { return false; };
+
+  /// Getters and setters for stack size and heap size not relevant.
+  Error getDeviceStackSize(uint64_t &Value) override {
+    Value = 0;
+    return Plugin::success();
+  }
+  Error setDeviceStackSize(uint64_t Value) override {
+    return Plugin::success();
+  }
+  Error getDeviceHeapSize(uint64_t &Value) override {
+    Value = 0;
+    return Plugin::success();
+  }
+  Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
+
+private:
+  /// Grid values for Generic ELF64 plugins.
+  static constexpr GV GenELF64GridValues = {
+      1, // GV_Slot_Size
+      1, // GV_Warp_Size
+      1, // GV_Max_Teams
+      1, // GV_SimpleBufferSize
+      1, // GV_Max_WG_Size
+      1, // GV_Default_WG_Size
+  };
+};
+
+class GenELF64GlobalHandlerTy final : public GenericGlobalHandlerTy {
+public:
+  Error getGlobalMetadataFromDevice(GenericDeviceTy &GenericDevice,
+                                    DeviceImageTy &Image,
+                                    GlobalTy &DeviceGlobal) override {
+    const char *GlobalName = DeviceGlobal.getName().data();
+    GenELF64DeviceImageTy &GenELF64Image =
+        static_cast<GenELF64DeviceImageTy &>(Image);
+
+    // Get dynamic library that has loaded the device image.
+    DynamicLibrary &DynLib = GenELF64Image.getDynamicLibrary();
+
+    // Get the address of the symbol.
+    void *Addr = DynLib.getAddressOfSymbol(GlobalName);
+    if (Addr == nullptr) {
+      return Plugin::error("Failed to load global '%s'", GlobalName);
+    }
+
+    // Save the pointer to the symbol.
+    DeviceGlobal.setPtr(Addr);
+
+    return Plugin::success();
+  }
+};
+
+/// Class implementing the plugin functionalities for GenELF64.
+struct GenELF64PluginTy final : public GenericPluginTy {
+  /// Create the plugin.
+  GenELF64PluginTy() : GenericPluginTy() {
+    // Initialize the generic plugin structure with multiple devices and a
+    // global handler.
+    GenericPluginTy::init(NUM_DEVICES, new GenELF64GlobalHandlerTy());
+  }
+
+  /// This class should not be copied.
+  GenELF64PluginTy(const GenELF64PluginTy &) = delete;
+  GenELF64PluginTy(GenELF64PluginTy &&) = delete;
+
+  ~GenELF64PluginTy() {}
+
+  /// Get the ELF code to recognize the compatible binary images.
+  uint16_t getMagicElfBits() const override { return TARGET_ELF_ID; }
+
+  /// Create a GenELF64 device with a specific id.
+  GenELF64DeviceTy &createDevice(int32_t DeviceId) override {
+    GenELF64DeviceTy *Device = new GenELF64DeviceTy(DeviceId, getNumDevices());
+    return *Device;
+  }
+
+  /// This plugin does not support exchanging data between two devices.
+  bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
+    return false;
+  }
+
+  /// All images (ELF-compatible) should be compatible with this plugin.
+  Expected<bool> isImageCompatible(__tgt_image_info *Info) const override {
+    return true;
+  }
+};
+
+Error Plugin::init() {
+  // Call the getter to intialize the GenELF64 plugin.
+  get();
+  return Plugin::success();
+}
+
+Error Plugin::deinit() {
+  // The Generic ELF64 plugin should already be deinitialized at this point.
+  if (Plugin::isActive())
+    return Plugin::error("Generic ELF64 plugin is not deinitialized");
+
+  return Plugin::success();
+}
+
+GenericPluginTy &Plugin::get() {
+  static GenELF64PluginTy GenELF64Plugin;
+  assert(Plugin::isActive() && "Plugin is not active");
+  return GenELF64Plugin;
+}
+
+template <typename... ArgsTy>
+Error Plugin::check(int32_t Code, const char *ErrMsg, ArgsTy... Args) {
+  if (Code == 0)
+    return Error::success();
+
+  return createStringError<ArgsTy..., const char *>(
+      inconvertibleErrorCode(), ErrMsg, Args..., std::to_string(Code).data());
+}
+
+} // namespace plugin
+} // namespace target
+} // namespace omp
+} // namespace llvm
Index: openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+else()
+ libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.")
+endif()
Index: openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a ppc64le machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21")
+else()
+ libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.")
+endif()
Index: openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
===================================================================
--- /dev/null
+++ openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===----------------------------------------------------------------------===##
+# 
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# 
+##===----------------------------------------------------------------------===##
+#
+# Build a plugin for a x86_64 machine if available.
+#
+##===----------------------------------------------------------------------===##
+
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+  build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+else()
+ libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.")
+endif()
Index: openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
===================================================================
--- openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
+++ openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h
@@ -24,11 +24,14 @@
 typedef struct CUstream_st *CUstream;
 typedef struct CUevent_st *CUevent;
 
+#define CU_DEVICE_INVALID ((CUdevice)-2)
+
 typedef enum cudaError_enum {
   CUDA_SUCCESS = 0,
   CUDA_ERROR_INVALID_VALUE = 1,
   CUDA_ERROR_NO_DEVICE = 100,
   CUDA_ERROR_INVALID_HANDLE = 400,
+  CUDA_ERROR_TOO_MANY_PEERS = 711,
 } CUresult;
 
 typedef enum CUstream_flags_enum {
Index: openmp/libomptarget/src/rtl.cpp
===================================================================
--- openmp/libomptarget/src/rtl.cpp
+++ openmp/libomptarget/src/rtl.cpp
@@ -16,6 +16,8 @@
 #include "private.h"
 #include "rtl.h"
 
+#include "Utilities.h"
+
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
@@ -24,16 +26,17 @@
 
 using namespace llvm;
 using namespace llvm::sys;
+using namespace llvm::omp::target;
 
 // List of all plugins that can support offloading.
 static const char *RTLNames[] = {
-    /* PowerPC target       */ "libomptarget.rtl.ppc64.so",
-    /* x86_64 target        */ "libomptarget.rtl.x86_64.so",
-    /* CUDA target          */ "libomptarget.rtl.cuda.so",
-    /* AArch64 target       */ "libomptarget.rtl.aarch64.so",
-    /* SX-Aurora VE target  */ "libomptarget.rtl.ve.so",
-    /* AMDGPU target        */ "libomptarget.rtl.amdgpu.so",
-    /* Remote target        */ "libomptarget.rtl.rpc.so",
+    /* PowerPC target       */ "libomptarget.rtl.ppc64",
+    /* x86_64 target        */ "libomptarget.rtl.x86_64",
+    /* CUDA target          */ "libomptarget.rtl.cuda",
+    /* AArch64 target       */ "libomptarget.rtl.aarch64",
+    /* SX-Aurora VE target  */ "libomptarget.rtl.ve",
+    /* AMDGPU target        */ "libomptarget.rtl.amdgpu",
+    /* Remote target        */ "libomptarget.rtl.rpc",
 };
 
 PluginManager *PM;
@@ -86,152 +89,166 @@
 
   DP("Loading RTLs...\n");
 
+  BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", false);
+
   // Attempt to open all the plugins and, if they exist, check if the interface
   // is correct and if they are supporting any devices.
-  for (auto *Name : RTLNames) {
-    DP("Loading library '%s'...\n", Name);
-    std::string ErrMsg;
-    auto DynLibrary = std::make_unique<sys::DynamicLibrary>(
-        sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg));
-
-    if (!DynLibrary->isValid()) {
-      // Library does not exist or cannot be found.
-      DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str());
-      continue;
-    }
-
-    DP("Successfully loaded library '%s'!\n", Name);
-
+  for (const char *Name : RTLNames) {
     AllRTLs.emplace_back();
 
-    // Retrieve the RTL information from the runtime library.
-    RTLInfoTy &R = AllRTLs.back();
-
-    // Remove plugin on failure to call optional init_plugin
-    *((void **)&R.init_plugin) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin");
-    if (R.init_plugin) {
-      int32_t Rc = R.init_plugin();
-      if (Rc != OFFLOAD_SUCCESS) {
-        DP("Unable to initialize library '%s': %u!\n", Name, Rc);
-        AllRTLs.pop_back();
+    RTLInfoTy &RTL = AllRTLs.back();
+
+    const std::string BaseRTLName(Name);
+    if (NextGenPlugins) {
+      if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL))
         continue;
-      }
-    }
 
-    bool ValidPlugin = true;
-
-    if (!(*((void **)&R.is_valid_binary) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.number_of_devices) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.init_device) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.load_binary) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.data_alloc) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.data_submit) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.data_retrieve) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.data_delete) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.run_region) =
-              DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region")))
-      ValidPlugin = false;
-    if (!(*((void **)&R.run_team_region) = DynLibrary->getAddressOfSymbol(
-              "__tgt_rtl_run_target_team_region")))
-      ValidPlugin = false;
-
-    // Invalid plugin
-    if (!ValidPlugin) {
-      DP("Invalid plugin as necessary interface is not found.\n");
-      AllRTLs.pop_back();
-      continue;
+      DP("Falling back to original plugin...\n");
     }
 
-    // No devices are supported by this RTL?
-    if (!(R.NumberOfDevices = R.number_of_devices())) {
-      // The RTL is invalid! Will pop the object from the RTLs list.
-      DP("No devices supported in this RTL\n");
+    if (!attemptLoadRTL(BaseRTLName + ".so", RTL))
       AllRTLs.pop_back();
-      continue;
+  }
+
+  DP("RTLs loaded!\n");
+}
+
+bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
+  const char *Name = RTLName.c_str();
+
+  DP("Loading library '%s'...\n", Name);
+
+  std::string ErrMsg;
+  auto DynLibrary = std::make_unique<sys::DynamicLibrary>(
+      sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg));
+
+  if (!DynLibrary->isValid()) {
+    // Library does not exist or cannot be found.
+    DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str());
+    return false;
+  }
+
+  DP("Successfully loaded library '%s'!\n", Name);
+
+  // Remove plugin on failure to call optional init_plugin
+  *((void **)&RTL.init_plugin) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin");
+  if (RTL.init_plugin) {
+    int32_t Rc = RTL.init_plugin();
+    if (Rc != OFFLOAD_SUCCESS) {
+      DP("Unable to initialize library '%s': %u!\n", Name, Rc);
+      return false;
     }
+  }
 
-#ifdef OMPTARGET_DEBUG
-    R.RTLName = Name;
-#endif
+  bool ValidPlugin = true;
+
+  if (!(*((void **)&RTL.is_valid_binary) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.number_of_devices) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.init_device) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.load_binary) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.data_alloc) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.data_submit) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.data_retrieve) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.data_delete) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.run_region) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region")))
+    ValidPlugin = false;
+  if (!(*((void **)&RTL.run_team_region) =
+            DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_team_region")))
+    ValidPlugin = false;
+
+  // Invalid plugin
+  if (!ValidPlugin) {
+    DP("Invalid plugin as necessary interface is not found.\n");
+    return false;
+  }
 
-    DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(),
-       R.NumberOfDevices);
-
-    // Optional functions
-    *((void **)&R.deinit_plugin) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin");
-    *((void **)&R.is_valid_binary_info) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info");
-    *((void **)&R.deinit_device) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device");
-    *((void **)&R.init_requires) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires");
-    *((void **)&R.data_submit_async) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async");
-    *((void **)&R.data_retrieve_async) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async");
-    *((void **)&R.run_region_async) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async");
-    *((void **)&R.run_team_region_async) = DynLibrary->getAddressOfSymbol(
-        "__tgt_rtl_run_target_team_region_async");
-    *((void **)&R.synchronize) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize");
-    *((void **)&R.data_exchange) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange");
-    *((void **)&R.data_exchange_async) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async");
-    *((void **)&R.is_data_exchangable) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable");
-    *((void **)&R.register_lib) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib");
-    *((void **)&R.unregister_lib) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib");
-    *((void **)&R.supports_empty_images) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images");
-    *((void **)&R.set_info_flag) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag");
-    *((void **)&R.print_device_info) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info");
-    *((void **)&R.create_event) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event");
-    *((void **)&R.record_event) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event");
-    *((void **)&R.wait_event) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event");
-    *((void **)&R.sync_event) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event");
-    *((void **)&R.destroy_event) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event");
-    *((void **)&R.release_async_info) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info");
-    *((void **)&R.init_async_info) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info");
-    *((void **)&R.init_device_info) =
-        DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info");
-
-    R.LibraryHandler = std::move(DynLibrary);
+  // No devices are supported by this RTL?
+  if (!(RTL.NumberOfDevices = RTL.number_of_devices())) {
+    // The RTL is invalid! Will pop the object from the RTLs list.
+    DP("No devices supported in this RTL\n");
+    return false;
   }
 
-  DP("RTLs loaded!\n");
+#ifdef LIBOMPTARGET_DEBUG
+  RTL.RTLName = Name;
+#endif
 
-  return;
+  DP("Registering RTL %s supporting %d devices!\n", Name, RTL.NumberOfDevices);
+
+  // Optional functions
+  *((void **)&RTL.deinit_plugin) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin");
+  *((void **)&RTL.is_valid_binary_info) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info");
+  *((void **)&RTL.deinit_device) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device");
+  *((void **)&RTL.init_requires) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires");
+  *((void **)&RTL.data_submit_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async");
+  *((void **)&RTL.data_retrieve_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async");
+  *((void **)&RTL.run_region_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async");
+  *((void **)&RTL.run_team_region_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_team_region_async");
+  *((void **)&RTL.synchronize) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize");
+  *((void **)&RTL.data_exchange) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange");
+  *((void **)&RTL.data_exchange_async) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async");
+  *((void **)&RTL.is_data_exchangable) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable");
+  *((void **)&RTL.register_lib) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib");
+  *((void **)&RTL.unregister_lib) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib");
+  *((void **)&RTL.supports_empty_images) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images");
+  *((void **)&RTL.set_info_flag) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag");
+  *((void **)&RTL.print_device_info) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info");
+  *((void **)&RTL.create_event) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event");
+  *((void **)&RTL.record_event) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event");
+  *((void **)&RTL.wait_event) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event");
+  *((void **)&RTL.sync_event) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event");
+  *((void **)&RTL.destroy_event) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event");
+  *((void **)&RTL.release_async_info) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info");
+  *((void **)&RTL.init_async_info) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info");
+  *((void **)&RTL.init_device_info) =
+      DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info");
+
+  RTL.LibraryHandler = std::move(DynLibrary);
+
+  // Successfully loaded
+  return true;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
Index: openmp/libomptarget/test/lit.cfg
===================================================================
--- openmp/libomptarget/test/lit.cfg
+++ openmp/libomptarget/test/lit.cfg
@@ -21,6 +21,10 @@
 if 'LIBOMPTARGET_DEBUG' in os.environ:
     config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG']
 
+# Allow running the tests with nextgen plugins when available
+if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ:
+    config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS']
+
 if 'OMP_TARGET_OFFLOAD' in os.environ:
     config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD']