Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -56,24 +56,24 @@ struct GV { /// The size reserved for data in a shared memory slot. - const unsigned GV_Slot_Size; + unsigned GV_Slot_Size; /// The default value of maximum number of threads in a worker warp. - const unsigned GV_Warp_Size; + unsigned GV_Warp_Size; constexpr unsigned warpSlotSize() const { return GV_Warp_Size * GV_Slot_Size; } /// the maximum number of teams. - const unsigned GV_Max_Teams; + unsigned GV_Max_Teams; // An alternative to the heavy data sharing infrastructure that uses global // memory is one that uses device __shared__ memory. The amount of such space // (in bytes) reserved by the OpenMP runtime is noted here. - const unsigned GV_SimpleBufferSize; + unsigned GV_SimpleBufferSize; // The absolute maximum team size for a working group - const unsigned GV_Max_WG_Size; + unsigned GV_Max_WG_Size; // The default maximum team size for a working group - const unsigned GV_Default_WG_Size; + unsigned GV_Default_WG_Size; constexpr unsigned maxWarpNumber() const { return GV_Max_WG_Size / GV_Warp_Size; Index: openmp/libomptarget/CMakeLists.txt =================================================================== --- openmp/libomptarget/CMakeLists.txt +++ openmp/libomptarget/CMakeLists.txt @@ -85,6 +85,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) +add_subdirectory(plugins-nextgen) add_subdirectory(DeviceRTL) add_subdirectory(tools) Index: openmp/libomptarget/include/Utilities.h =================================================================== --- /dev/null +++ openmp/libomptarget/include/Utilities.h @@ -0,0 +1,297 @@ +//===------- Utilities.h - Target independent OpenMP target RTL -- C++ ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Routines and classes used to provide useful functionalities like string +// parsing and environment variables. +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H +#define OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H + +#include "llvm/ADT/STLFunctionalExtras.h" + +#include "Debug.h" + +#include +#include +#include +#include +#include +#include +#include + + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class implementing error code checking and function return values. +/// Do not use StatusCode_ directly, use StatusCode instead, which is defined +/// below. +template +class StatusCode_ { + enum CheckedKind { + DONE = 0, + PENDING = 1, + IGNORE = 2 + }; + + int32_t Code; + mutable CheckedKind Checked; + + explicit StatusCode_(int32_t CodeValue, CheckedKind CheckedValue) : + Code(CodeValue), Checked(CheckedValue) {} + + void setChecked() const { + if (Checked != IGNORE) + Checked = DONE; + } + +public: + /// Pre-defined status codes useful for returning and comparing. + static const StatusCode_ SUCCESS; + static const StatusCode_ FAIL; + + /// Create an empty status code. Can be destroyed without checking. + StatusCode_() : + Code(0), Checked(DONE) {} + + /// Create a status code with a specific value. 
Must be checked before + /// being destroyed. + explicit StatusCode_(int32_t CodeValue) + : Code(CodeValue), Checked(PENDING) {} + + /// Create a status code from another. Inherit the code only; the code has to + /// be checked independently of the other status code object. + StatusCode_(const StatusCode_ &OtherSC) + : Code(OtherSC.Code), Checked(PENDING) {} + + /// Create a status code moving it from another. Inherit the code only; the + /// code has to be checked independently of the other status code object. The + /// other status code object does not need to be checked. + StatusCode_(StatusCode_ &&OtherSC) + : Code(OtherSC.Code), Checked(PENDING) { + OtherSC.Checked = DONE; + } + + ~StatusCode_() { + assert(Checked != PENDING && "Unchecked status code"); + } + + /// Copy operator working similar as the copy constructor. + StatusCode_ &operator=(const StatusCode_ &OtherSC) { + if (this != &OtherSC) { + assert(Checked != PENDING && "Overwriting unchecked status code"); + + Code = OtherSC.Code; + Checked = (OtherSC.Checked != IGNORE) ? OtherSC.Checked : PENDING; + } + return *this; + } + + /// Move operator working similar as the move constructor. + StatusCode_ &operator=(StatusCode_ &&OtherSC) { + if (this != &OtherSC) { + assert(Checked != PENDING && "Overwriting unchecked status code"); + assert(OtherSC.Checked != IGNORE && "Predefined status cannot be moved"); + + Code = OtherSC.Code; + Checked = OtherSC.Checked; + + OtherSC.Checked = DONE; + } + return *this; + } + + /// Get the status code and set as checked. + int32_t getCode() const { + setChecked(); + return Code; + } + + /// Indicate whether the status code is successful and set as checked. + bool succeed() const { + return getCode() == 0; + } + + /// Indicate whether the status code is failed and set as checked. + bool failed() const { + return !succeed(); + } + + /// Indicate whether the status code is failed and set as checked. + operator bool() const { + return getCode(); + } + + /// Compare the status code with another object. + bool operator ==(const StatusCode_ &OtherSC) const { + setChecked(); + return (bool)Code == (bool)OtherSC.Code; + } +}; + +template +const StatusCode_ StatusCode_::SUCCESS(0, StatusCode_::IGNORE); + +template +const StatusCode_ StatusCode_::FAIL(1, StatusCode_::IGNORE); + +/// Always use StatusCode instead of StatusCode_. This trick allows using static +/// inline class members before C++17. Static inline class members are cleanly +/// supported by C++17 and beyond. +using StatusCode = StatusCode_; + +/// Utility class for parsing strings to other types. +struct StringParser { + /// Parse a string to another type. + template + static bool parse(const char *Value, Ty &Result); + + // Convert bool to "Yes" / "No" strings. + static constexpr const char *boolToYesNoString(bool Value) { + return Value ? "Yes" : "No"; + } +}; + +/// Class for reading and checking environment variables. Currently working with +/// integer, floats, std::string and bool types. +template class Envar { + std::string Name; + Ty Data; + bool IsPresent; + bool Initialized; + +public: + /// Create an empty envar. Cannot be consulted. This constructor is merely + /// for convenience. + Envar() : Name(), Data(Ty()), IsPresent(false), Initialized(false) {} + + /// Create an envar with a name and an optional default. The Envar object will + /// take the value read from the environment variable, or the default if it + /// was not set or not correct. 
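+  ///
+  /// A minimal usage sketch (the envar name below is only an illustrative
+  /// assumption, not one defined here):
+  ///
+  /// \code
+  ///   Int32Envar Example("LIBOMPTARGET_EXAMPLE_ENVAR", /*Default=*/32);
+  ///   int32_t Value = Example.get(); // 32 unless the envar overrides it.
+  /// \endcode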
+ Envar(const char *EnvarName, Ty Default = Ty()) + : Name(EnvarName), Data(Default), IsPresent(false), Initialized(true) { + if (const char *EnvStr = getenv(EnvarName)) { + // Check whether the envar is defined and valid. + IsPresent = StringParser::parse(EnvStr, Data); + + if (!IsPresent) { + DP("Ignoring invalid value %s for envar %s\n", EnvStr, EnvarName); + Data = Default; + } + } + } + + /// Create an envar with a name, getter function and a setter function. The + /// Envar object will take the value read from the environment variable if + /// this value is accepted by the setter function. Otherwise, the getter + /// function will be executed to get the default value. The getter should be + /// of the form StatusCode GetterFunctionTy(Ty &Value) and the setter should + /// be of the form StatusCode SetterFunctionTy(Ty Value). + template + Envar(const char *EnvarName, GetterFunctor Getter, SetterFunctor Setter) + : Name(EnvarName), Data(Ty()), IsPresent(false), Initialized(true) { + init(Getter, Setter); + } + + /// Get the definitive value. + const Ty &get() const { + // Throw a runtime error in case this envar is not initialized. + if (!Initialized) + FATAL_MESSAGE0(1, "Consulting envar %s before initialization", Name.data()); + + return Data; + } + + /// Get the definitive value. + operator Ty() const { return get(); } + + /// Indicate whether the environment variable was defined and valid. + bool isPresent() const { return IsPresent; } + +private: + template + void init(GetterFunctor Getter, SetterFunctor Setter); +}; + +/// Define some common envar types. +using IntEnvar = Envar; +using Int32Envar = Envar; +using Int64Envar = Envar; +using UInt32Envar = Envar; +using UInt64Envar = Envar; +using StringEnvar = Envar; +using BoolEnvar = Envar; + +template <> +inline bool StringParser::parse(const char *ValueStr, bool &Result) +{ + std::string Value(ValueStr); + + // Convert the string to lowercase. + std::transform(Value.begin(), Value.end(), Value.begin(), + [](unsigned char c){ return std::tolower(c); }); + + // May be implemented with fancier C++ features, but let's keep it simple. + if (Value == "true" || Value == "yes" || Value == "on" || Value == "1") + Result = true; + else if (Value == "false" || Value == "no" || Value == "off" || Value == "0") + Result = false; + else + return false; + + // Parsed correctly. + return true; +} + +template +inline bool StringParser::parse(const char *Value, Ty &Result) +{ + assert(Value && "Parsed value cannot be null"); + + std::istringstream Stream(Value); + Stream >> Result; + + return !Stream.fail(); +} + +template +template +inline void Envar::init(GetterFunctor Getter, SetterFunctor Setter) { + // Get the default value (current). + Ty Default; + if (Getter(Default)) + FATAL_MESSAGE0(1, "Getter for envar %s failed", Name.data()); + + if (const char *EnvStr = getenv(Name.data())) { + IsPresent = StringParser::parse(EnvStr, Data); + if (IsPresent) { + // Check whether the envar value is actually valid. + IsPresent = !((bool) Setter(Data)); + if (!IsPresent) { + // Setter reported value as invalid, reset to the getter value. 
+ Data = Default; + DP("Setter of envar %s failed, resetting to %s\n", + Name.data(), std::to_string(Data).data()); + } + } else { + DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data()); + Data = Default; + } + } else { + Data = Default; + } +} + +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -169,6 +169,9 @@ // (i.e. the library attempts to load the RTLs (plugins) only once). std::once_flag InitFlag; void loadRTLs(); // not thread-safe + +private: + static bool attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL); }; /// Map between the host entry begin and the translation table. Each Index: openmp/libomptarget/plugins-nextgen/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/CMakeLists.txt @@ -0,0 +1,19 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +add_subdirectory(common) +add_subdirectory(cuda) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + Index: openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt @@ -0,0 +1,13 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +add_subdirectory(PluginInterface) Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt @@ -0,0 +1,32 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +# Plugin Interface library. +add_library(PluginInterface OBJECT PluginInterface.cpp GlobalHandler.cpp) + +# Define the TARGET_NAME. +add_definitions("-DTARGET_NAME=PluginInterface") + +# Define the DEBUG_PREFIX. +add_definitions(-DDEBUG_PREFIX="PluginInterface") + +set_property(TARGET PluginInterface PROPERTY POSITION_INDEPENDENT_CODE ON) +llvm_update_compile_flags(PluginInterface) +set(LINK_LLVM_LIBS LLVMSupport) +if (LLVM_LINK_LLVM_DYLIB) + set(LINK_LLVM_LIBS LLVM) +endif() +target_link_libraries(PluginInterface INTERFACE ${LINK_LLVM_LIBS} PRIVATE elf_common MemoryManager) +add_dependencies(PluginInterface ${LINK_LLVM_LIBS}) + +target_include_directories(PluginInterface INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(PluginInterface PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h @@ -0,0 +1,197 @@ +//===- GlobalHandler.h - Target independent global & enviroment handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H + +#include +#include + +#include "llvm/Object/ELFObjectFile.h" + +#include "Debug.h" +#include "omptarget.h" +#include "Utilities.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +struct DeviceImageTy; +struct GenericDeviceTy; + +using namespace llvm::object; + +/// Common abstraction for globals that live on the host and device. +/// It simply encapsulates the symbol name, symbol size, and symbol address +/// (which might be host or device depending on the context). 
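+///
+/// As an illustration only (the symbol name below is an assumption made for
+/// the example, not something defined by this interface):
+///
+/// \code
+///   GlobalTy DebugKind("__omp_rtl_debug_kind", sizeof(uint32_t));
+/// \endcode
+///
+/// describes a 4-byte global whose address has not been resolved yet.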
+class GlobalTy { + // TODO: Maybe we can have a pointer to the offload entry name instead of + // holding a private copy of the name as a std::string + std::string Name; + uint32_t Size; + void *Ptr; + +public: + GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr) + : Name(Name), Size(Size), Ptr(Ptr) {} + + GlobalTy(const __tgt_offload_entry &Entry) + : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {} + + const std::string &getName() const { return Name; } + uint32_t getSize() const { return Size; } + void *getPtr() const { return Ptr; } + + void setSize(int32_t S) { Size = S; } + void setPtr(void *P) { Ptr = P; } +}; + +/// Subclass of GlobalTy that holds the memory for a global of \p Ty. +template class StaticGlobalTy : public GlobalTy { + Ty Data; + +public: + template + StaticGlobalTy(const std::string &Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + template + StaticGlobalTy(const char *Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + template + StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args) + : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + + Ty &getValue() { return Data; } + const Ty &getValue() const { return Data; } + void setValue(const Ty &V) { Data = V; } +}; + +/// Subclass of GlobalTy that holds the memory which may exceed the global type +/// \p Ty. +template class DynamicGlobalTy : public GlobalTy { +public: + DynamicGlobalTy(const std::string &Name, uint32_t Size) + : GlobalTy(Name, Size, malloc(Size)) {} + DynamicGlobalTy(const char *Name, const char *Suffix, uint32_t Size) + : GlobalTy(std::string(Name) + Suffix, Size, malloc(Size)) {} + ~DynamicGlobalTy() { free(getPtr()); } + + Ty &getValue() { return *static_cast(getPtr()); } + const Ty &getValue() const { return *static_cast(getPtr()); } + void setValue(const Ty &V) { *getPtr() = V; } +}; + +/// Helper class to do the heavy lifting when it comes to moving globals between +/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD, +/// which means the only things specialized by the subclass is the retrival of +/// global metadata (size, addr) from the device. +/// \see getGlobalMetadataFromDevice +class GenericGlobalHandlerTy { + /// Map to store the ELF object files that have been loaded + std::unordered_map ELFObjectFiles; + std::mutex ELFObjectFilesMutex; + + /// Get the cached ELF64LEObjectFile previosuly created for a specific + /// device image or create it if did not exist. + const ELF64LEObjectFile *getOrCreateELFObjectFile( + const GenericDeviceTy &Device, + DeviceImageTy &Image); + + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + StatusCode moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + bool Device2Host); + + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + StatusCode moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal, + bool Device2Host); + +public: + virtual ~GenericGlobalHandlerTy() {} + + /// Get the address and size of a global in the image. Address and size are + /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal. 
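+  ///
+  /// A rough usage sketch (names are placeholders for the example):
+  ///
+  /// \code
+  ///   GlobalTy ImageGlobal("some_symbol", /*Size=*/0);
+  ///   if (getGlobalMetadataFromImage(Device, Image, ImageGlobal).succeed())
+  ///     use(ImageGlobal.getPtr(), ImageGlobal.getSize());
+  /// \endcode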
+ StatusCode getGlobalMetadataFromImage(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &ImageGlobal); + + /// Read the memory associated with a global from the image and store it on + /// the host. The name, size, and destination are defined by \p HostGlobal. + StatusCode readGlobalFromImage(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal); + + /// Get the address and size of a global from the device. Address is return in + /// \p DeviceGlobal, the global name and expected size are passed in + /// \p DeviceGlobal. + virtual StatusCode getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) = 0; + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. The origin is defined by \p DeviceGlobal. + StatusCode readGlobalFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + /* D2H */ true); + } + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. The origin is automatically resolved. + StatusCode readGlobalFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /* D2H */ true); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is defined by \p DeviceGlobal. + StatusCode writeGlobalToDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + /* D2H */ false); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is automatically resolved. + StatusCode writeGlobalToDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /* D2H */ false); + } +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_GLOBALHANDLER_GLOBALHANDLER_H Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp @@ -0,0 +1,196 @@ +//===- GlobalHandler.cpp - Target independent global & env. var handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. 
+// +//===----------------------------------------------------------------------===// + +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include + +using namespace llvm; +using namespace omp; +using namespace target; +using namespace plugin; + +const ELF64LEObjectFile *GenericGlobalHandlerTy::getOrCreateELFObjectFile( + const GenericDeviceTy &Device, + DeviceImageTy &Image) { + + std::lock_guard Lock(ELFObjectFilesMutex); + auto Search = ELFObjectFiles.find(Image.getId()); + if (Search == ELFObjectFiles.end()) { + Expected ExpectedELF = + ELF64LEObjectFile::create(Image.getMemoryBuffer()); + if (!ExpectedELF) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Unable to open ELF image."); + return nullptr; + } + + ELFObjectFiles.emplace(Image.getId(), std::move(ExpectedELF.get())); + + Search = ELFObjectFiles.find(Image.getId()); + assert(Search != ELFObjectFiles.end()); + } + return &Search->second; +} + + +StatusCode GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( + GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + bool Device2Host) { + + GlobalTy DeviceGlobal(HostGlobal.getName(), HostGlobal.getSize()); + StatusCode SC = getGlobalMetadataFromDevice(Device, Image, DeviceGlobal); + if (SC) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Failed to read global symbol metadata for '%s' from the device", + HostGlobal.getName().c_str()); + return SC; + } + + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + Device2Host); +} + +/// Actually move memory between host and device. See readGlobalFromDevice and +/// writeGlobalToDevice for the interface description. +StatusCode GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( + GenericDeviceTy &Device, + DeviceImageTy &DeviceImage, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal, + bool Device2Host) { + + StatusCode SC; + if (Device2Host) + SC = Device.dataRetrieve(HostGlobal.getPtr(), DeviceGlobal.getPtr(), + HostGlobal.getSize(), nullptr); + else + SC = Device.dataSubmit(DeviceGlobal.getPtr(), HostGlobal.getPtr(), + HostGlobal.getSize(), nullptr); + + if (SC) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Failed to %s %u bytes associated with global symbol '%s' %s " + "the device", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().c_str(), Device2Host ? "from" : "to"); + return SC; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Successfully %s %u bytes associated with global symbol '%s' %s " + "the device (%p -> %p)", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().c_str(), Device2Host ? "from" : "to", + DeviceGlobal.getPtr(), HostGlobal.getPtr()); + + return SC; +} + +StatusCode GenericGlobalHandlerTy::getGlobalMetadataFromImage( + GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &ImageGlobal) { + + // Get the ELF object file for the image. Notice the ELF object may already + // be created in previous calls, so we can reuse it. + const ELF64LEObjectFile *ELF = getOrCreateELFObjectFile(Device, Image); + if (!ELF) + return StatusCode::FAIL; + + // Then extract the base address of elf image. + Expected StartAddr = ELF->getStartAddress(); + if (!StartAddr) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Unable to determine ELF start address."); + return StatusCode::FAIL; + } + + // TODO: We should improve the search by consulting HASH or GNU_HASH tables. 
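+  // For now, linearly scan the symbol table; once the requested name matches,
+  // the host-visible address of the symbol inside the mapped image is
+  // computed from the image start, the section offset and the symbol value
+  // (see below).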
+ for (auto &SymbolIt : ELF->symbols()) { + // First check the name, continue if we don't match. + Expected Name = SymbolIt.getName(); + if (!Name || !Name.get().equals(ImageGlobal.getName())) + continue; + + // If we match we will either succeed or fail with retriving the content, + // either way, the loop is done. First step is to verify the size. + ImageGlobal.setSize(SymbolIt.getSize()); + + // Then extract the relative offset from the elf image base. + Expected Offset = SymbolIt.getValue(); + if (!Offset) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image but address could " + "not be determined.", ImageGlobal.getName().c_str()); + return StatusCode::FAIL; + } + + Expected SectionIt = SymbolIt.getSection(); + if (!SectionIt) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image but section could " + "not be determined.", ImageGlobal.getName().c_str()); + return StatusCode::FAIL; + } + ELFSectionRef ELFSection = *(SectionIt.get()); + + Expected SectionName = ELFSection.getName(); + if (!SectionName) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image but the section name " + "could not be determined.", ImageGlobal.getName().c_str()); + return StatusCode::FAIL; + } + + ImageGlobal.setPtr((char *)Image.getStart() + ELFSection.getOffset() + + Offset.get()); + + return StatusCode::SUCCESS; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was not found in the elf image.", + ImageGlobal.getName().c_str()); + return StatusCode::FAIL; +} + +StatusCode GenericGlobalHandlerTy::readGlobalFromImage( + GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + + GlobalTy ImageGlobal(HostGlobal.getName(), -1); + StatusCode SC = getGlobalMetadataFromImage(Device, Image, ImageGlobal); + if (SC) + return SC; + + if (ImageGlobal.getSize() != HostGlobal.getSize()) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' has %u bytes in the elf image but %u bytes " + "on the host, abort transfer.", + HostGlobal.getName().c_str(), ImageGlobal.getSize(), + HostGlobal.getSize()); + return StatusCode::FAIL; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image and %u bytes will " + "copied from %p to %p.", + HostGlobal.getName().c_str(), HostGlobal.getSize(), ImageGlobal.getPtr(), + HostGlobal.getPtr()); + std::memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize()); + return StatusCode::SUCCESS; +} Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -0,0 +1,820 @@ +//===- PluginInterface.h - Target independent plugin device interface -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "MemoryManager.h" +#include "Utilities.h" +#include "omptarget.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBufferRef.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +struct GenericPluginTy; +struct GenericKernelTy; +struct GenericDeviceTy; + +/// Class that wraps the __tgt_async_info to simply its usage. In case the +/// object is constructed without a valid __tgt_async_info, the object will use +/// an internal one and will synchronize the current thread with the pending +/// operations on object destruction. +struct AsyncInfoWrapperTy { + AsyncInfoWrapperTy(StatusCode &SC, GenericDeviceTy &Device, + __tgt_async_info *AsyncInfoPtr) + : SC(SC), Device(Device), + AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {} + + /// Synchronize with the __tgt_async_info's pending operations if it's the + /// internal one. + ~AsyncInfoWrapperTy(); + + /// Get the raw __tgt_async_info pointer. + operator __tgt_async_info *() const { return AsyncInfoPtr; } + + /// Get a reference to the underlying plugin-specific queue type. + template Ty &getQueueAs() const { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type!"); + return reinterpret_cast(AsyncInfoPtr->Queue); + } + +private: + StatusCode &SC; + GenericDeviceTy &Device; + __tgt_async_info LocalAsyncInfo; + __tgt_async_info *const AsyncInfoPtr; +}; + +/// Class wrapping a __tgt_device_image and its offload entry table on a +/// specific device. This class is responsible for storing and managing +/// the offload entries for an image on a device. +class DeviceImageTy { + + /// Class representing the offload entry table. The class stores the + /// __tgt_target_table and a map to search in the table faster. + struct OffloadEntryTableTy { + /// Clear the table of entries. + void clear() { + Entries.clear(); + TTTablePtr.EntriesBegin = TTTablePtr.EntriesEnd = nullptr; + } + + /// Add new entry in the table. + void addEntry(const __tgt_offload_entry &Entry) { + Entries.push_back(Entry); + TTTablePtr.EntriesBegin = &Entries[0]; + TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size(); + } + + /// Get the offload entry by its address. + __tgt_offload_entry *getEntry(void *Addr) { + auto It = EntryMap.find(Addr); + if (It == EntryMap.end()) + return nullptr; + return &Entries[It->second]; + } + + /// Get the raw pointer to the __tgt_target_table. + operator __tgt_target_table *() { + if (Entries.empty()) + return nullptr; + return &TTTablePtr; + } + + private: + __tgt_target_table TTTablePtr; + std::map EntryMap; + std::vector<__tgt_offload_entry> Entries; + }; + + /// Image identifier within the corresponding device. Notice that this id is + /// not unique between different device; they may overlap. 
+ int32_t ImageId; + + /// The pointer to the raw __tgt_device_image. + const __tgt_device_image *TgtImage; + + /// Table of offload entries. + OffloadEntryTableTy OffloadEntryTable; + +public: + DeviceImageTy(int32_t Id, const __tgt_device_image *Image) + : ImageId(Id), TgtImage(Image) { + assert(TgtImage && "Target image is invalid"); + } + + /// Get the image identifier within the device. + int32_t getId() const { return ImageId; } + + /// Get the pointer to the raw __tgt_device_image. + const __tgt_device_image *getTgtImage() const { return TgtImage; } + + /// Get the image starting address. + void *getStart() const { return TgtImage->ImageStart; } + + /// Get the image size. + size_t getSize() const { + return ((char *)TgtImage->ImageEnd) - ((char *)TgtImage->ImageStart); + } + + /// Get a memory buffer reference to the whole image. + MemoryBufferRef getMemoryBuffer() const { + return MemoryBufferRef( + StringRef((const char *)getStart(), getSize()), "Image"); + } + + /// Get a reference to the offload entry table for the image. + OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; } +}; + +/// Class implementing common functionalities of offload kernels. Each plugin +/// should define the specific kernel class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericKernelTy { + /// Construct a kernel with a name and a execution mode. + GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode) + : Name(Name), ExecutionMode(ExecutionMode) {} + + virtual ~GenericKernelTy() {} + + /// Initialize the kernel object from a specific device. + StatusCode init(GenericDeviceTy &GenericDevice); + virtual StatusCode initImpl(GenericDeviceTy &GenericDevice) = 0; + + /// Launch the kernel on the specific device. The device must be the same + /// one used to initialize the kernel. + StatusCode launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + int32_t NumTeamsClause, int32_t ThreadLimitClause, + int32_t LoopTripCount, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + virtual StatusCode launchImpl(GenericDeviceTy &GenericDevice, + int32_t NumThreads, int32_t NumBlocks, + int32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; +private: + /// Prepare the arguments before launching the kernel. + void *prepareArgs(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + + /// Get the default number of threads and blocks for the kernel. + virtual int32_t getDefaultNumThreads(GenericDeviceTy &Device) const = 0; + virtual int32_t getDefaultNumBlocks(GenericDeviceTy &Device) const = 0; + + /// Get the number of threads and blocks for the kernel based on the + /// user-defined threads and block clauses. + int32_t getNumThreads(GenericDeviceTy &GenericDevice, + int32_t ThreadLimitClause) const; + int32_t getNumBlocks(GenericDeviceTy &GenericDevice, int32_t BlockLimitClause, + int32_t LoopTripCount, int32_t NumThreads) const; + + /// Get the kernel name. + const char *getName() const { return Name; } + + /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. 
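+  /// Exactly one of the following predicates holds, depending on the
+  /// execution mode flag the kernel was constructed with.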
+ bool isGenericSPMDMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; + } + bool isGenericMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC; + } + bool isSPMDMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; + } + + /// Get the execution mode name of the kernel. + const char *getExecutionModeName() const { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + return "SPMD"; + case OMP_TGT_EXEC_MODE_GENERIC: + return "Generic"; + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return "Generic-SPMD"; + } + llvm_unreachable("Unknown execution mode!"); + } + + /// The kernel name. + const char *Name; + + /// The execution flags of the kernel. + OMPTgtExecModeFlags ExecutionMode; + +protected: + /// The dynamic memory size reserved for executing the kernel. + int32_t DynamicMemorySize = -1; + + /// The preferred number of threads to run the kernel. + int32_t PreferredNumThreads = -1; + + /// The maximum number of threads which the kernel could leverage. + int32_t MaxNumThreads = -1; +}; + +/// Class implementing common functionalities of offload devices. Each plugin +/// should define the specific device class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericDeviceTy : public DeviceAllocatorTy { + /// Construct a device with its device id within the plugin, the number of + /// devices in the plugin and the grid values for that kind of device. + GenericDeviceTy(int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &GridValues); + + /// Get the device identifier within the corresponding plugin. Notice that + /// this id is not unique between different plugins; they may overlap. + int32_t getDeviceId() const { return DeviceId; } + + /// Set the context of the device if needed, before calling device-specific + /// functions. Plugins may implement this function as a no-op if not needed. + virtual StatusCode setContext() = 0; + + /// Initialize the device. After this call, the device should be already + /// working and ready to accept queries or modifications. + StatusCode init(GenericPluginTy &Plugin); + virtual StatusCode initImpl(GenericPluginTy &Plugin) = 0; + + /// Deinitialize the device and free all its resources. After this call, the + /// device is no longer considered ready, so no queries or modifications are + /// allowed. + StatusCode deinit(); + virtual StatusCode deinitImpl() = 0; + + /// Load the binary image into the device and return the target table. + __tgt_target_table *loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage); + virtual DeviceImageTy *loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) = 0; + + /// Setup the device environment if needed. Notice this setup may not be run + /// on some plugins. By default, it will be executed, but plugins can change + /// this behavior by overriding the shouldSetupDeviceEnvironment function. + StatusCode setupDeviceEnvironment(GenericPluginTy &Plugin, + DeviceImageTy &Image); + + /// Register the offload entries for a specific image on the device. + StatusCode registerOffloadEntries(DeviceImageTy &Image); + + /// Synchronize the current thread with the pending operations on the + /// __tgt_async_info structure. + StatusCode synchronize(__tgt_async_info *AsyncInfo); + virtual StatusCode synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Allocate data on the device or involving the device. 
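+  /// For illustration, a caller could request plain device memory with:
+  ///
+  /// \code
+  ///   void *TgtPtr = Device.dataAlloc(Size, /*HostPtr=*/nullptr,
+  ///                                   TARGET_ALLOC_DEVICE);
+  /// \endcode
+  ///
+  /// where TARGET_ALLOC_DEVICE is one of the TargetAllocTy kinds declared in
+  /// omptarget.h.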
+ void *dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); + + /// Deallocate data from the device or involving the device. + StatusCode dataDelete(void *TgtPtr, TargetAllocTy Kind); + + /// Submit data to the device (host to device transfer). + StatusCode dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual StatusCode dataSubmitImpl(void *TgtPtr, const void *HstPtr, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Retrieve data from the device (device to host transfer). + StatusCode dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual StatusCode dataRetrieveImpl(void *HstPtr, const void *TgtPtr, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Exchange data between devices (device to device transfer). Calling this function + /// is only valid if GenericPlugin::isDataExchangable() passing the two devices + /// returns true. + StatusCode dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual StatusCode dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Run the target region with multiple teams. + StatusCode runTargetTeamRegion(void *EntryPtr, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + int32_t NumTeamsClause, + int32_t ThreadLimitClause, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo); + + /// Initialize a __tgt_async_info structure. Related to interop features. + StatusCode initAsyncInfo(__tgt_async_info **AsyncInfoPtr); + virtual StatusCode initAsyncInfoImpl( + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Initialize a __tgt_device_info structure. Related to interop features. + StatusCode initDeviceInfo(__tgt_device_info *DeviceInfo, const char **Err); + virtual StatusCode initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + + /// Create an event. + StatusCode createEvent(void **EventPtrStorage); + virtual StatusCode createEventImpl(void **EventPtrStorage) = 0; + + /// Destroy an event. + StatusCode destroyEvent(void *Event); + virtual StatusCode destroyEventImpl(void *EventPtr) = 0; + + /// Start the recording of the event. + StatusCode recordEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual StatusCode recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Wait for an event to finish. Notice this wait is asynchronous if the + /// __tgt_async_info is not nullptr. + StatusCode waitEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual StatusCode waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Synchronize the current thread with the event. + StatusCode syncEvent(void *EventPtr); + virtual StatusCode syncEventImpl(void *EventPtr) = 0; + + /// Print information about the device. + void printInfo(); + virtual void printInfoImpl() = 0; + + /// Getters of the grid values. + int32_t getWarpSize() const { return GridValues.GV_Warp_Size; } + int32_t getBlockLimit() const { return GridValues.GV_Max_Teams; } + int32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; } + int32_t getDefaultNumThreads() const { + return GridValues.GV_Default_WG_Size; + } + int32_t getDefaultNumBlocks() const { + // TODO: Introduce a default num blocks value. 
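+    // Until then, reuse the default workgroup size value as a placeholder.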
+ return GridValues.GV_Default_WG_Size; + } + int32_t getDynamicMemorySize() const { + return OMPX_SharedMemorySize; + } + +private: + /// Register offload entry for global variable. + StatusCode registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, + const __tgt_offload_entry &GlobalEntry, + __tgt_offload_entry &DeviceEntry); + + /// Register offload entry for kernel function. + StatusCode registerKernelOffloadEntry(DeviceImageTy &DeviceImage, + const __tgt_offload_entry &KernelEntry, + __tgt_offload_entry &DeviceEntry); + + /// Allocate and construct a kernel object. + virtual GenericKernelTy * + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) = 0; + + /// Get and set the stack size and heap size for the device. If not used, the + /// plugin can implement the setters as no-op and setting the output + /// value to zero for the getters. + virtual StatusCode getDeviceStackSize(uint64_t &V) = 0; + virtual StatusCode setDeviceStackSize(uint64_t V) = 0; + virtual StatusCode getDeviceHeapSize(uint64_t &V) = 0; + virtual StatusCode setDeviceHeapSize(uint64_t V) = 0; + + /// Environment variables defined by the OpenMP standard. + Int32Envar OMP_TeamLimit; + Int32Envar OMP_NumTeams; + Int32Envar OMP_TeamsThreadLimit; + + /// Environment variables defined by the LLVM OpenMP implementation. + Int32Envar OMPX_DebugKind; + Int32Envar OMPX_SharedMemorySize; + UInt64Envar OMPX_TargetStackSize; + UInt64Envar OMPX_TargetHeapSize; + + /// Pointer to the memory manager or nullptr if not available. + MemoryManagerTy *MemoryManager; + +protected: + /// Array of images loaded into the device. Images are automatically + /// deallocated by the allocator. + std::vector LoadedImages; + + /// The identifier of the device within the plugin. Notice this is not a + /// global device id and is not the device id visible to the OpenMP user. + const int32_t DeviceId; + + /// The default grid values used for this device. + llvm::omp::GV GridValues; + + /// Enumeration used for representing the current state between two devices + /// two devices (both under the same plugin) for the peer access between them. + /// The states can be a) PENDING when the state has not been queried and needs + /// to be queried, b) AVAILABLE when the peer access is available to be used, + /// and c) UNAVAILABLE if the system does not allow it. + enum class PeerAccessState : uint8_t { + AVAILABLE, + UNAVAILABLE, + PENDING + }; + + /// Array of peer access states with the rest of devices. This means that if + /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I can access device J's memory directly. However, notice this + /// does not mean that device J can access device I's memory directly. + std::vector PeerAccesses; + std::mutex PeerAccessesLock; +}; + +/// Class implementing common functionalities of offload plugins. Each plugin +/// should define the specific plugin class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericPluginTy { + + /// Construct a plugin instance. The number of active instances should be + /// always be either zero or one. + GenericPluginTy() { + ++NumActiveInstances; + } + + /// Destroy the plugin instance and release all its resources. Also decrease + /// the number of instances. + virtual ~GenericPluginTy() { + // There is no global handler if no device is available. + if (GlobalHandler) + delete GlobalHandler; + + // Deinitialize all active devices. 
+ for (int32_t Id = 0; Id < NumDevices; ++Id) { + if (Devices[Id]) { + StatusCode SC = deinitDevice(Id); + if (SC) + REPORT("Error when deinitializing device %d\n", Id); + } + assert(!Devices[Id] && "Device was not deinitialized"); + } + + --NumActiveInstances; + } + + /// Get the reference to the device with a certain device id. + GenericDeviceTy &getDevice(int32_t DeviceId) { + assert(isValidDeviceId(DeviceId) && "Device Id is invalid"); + assert(Devices[DeviceId] && "Device is unitialized"); + + return *Devices[DeviceId]; + } + + /// Get the number of active devices. + int32_t getNumDevices() const { return NumDevices; } + + /// Get the ELF code to recognize the binary image of this plugin. + virtual uint16_t getMagicElfBits() const = 0; + + /// Allocate a structure using the internal allocator. + template Ty *allocate() { + return reinterpret_cast(Allocator.Allocate(sizeof(Ty), alignof(Ty))); + } + + /// Get the reference to the global handler of this plugin. + GenericGlobalHandlerTy &getGlobalHandler() { + assert(GlobalHandler && "Global handler not initialized"); + return *GlobalHandler; + } + + /// Get the OpenMP requires flags set for this plugin. + int64_t getRequiresFlags() const { return RequiresFlags; } + + /// Set the OpenMP requires flags for this plugin. + void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; } + + /// Initialize a device within the plugin. + StatusCode initDevice(int32_t DeviceId); + + /// Deinitialize a device within the plugin and release its resources. + StatusCode deinitDevice(int32_t DeviceId); + + /// Indicate whether data can be exchanged directly between two devices under + /// this same plugin. If this function returns true, it's safe to call the + /// GenericDeviceTy::exchangeData() function on the source device. + virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId); + } + + /// Indicate if an image is compatible with a specific device. Notice that + /// this function may be called before actually initializing the device. So + /// we could not move this function into GenericDeviceTy. + virtual bool isImageCompatible(int32_t DeviceId, + __tgt_image_info *Info) const = 0; + + /// Indicate whether the plugin supports empty images. + virtual bool supportsEmptyImages() const { return false; } + + /// Indicate whether there is any active plugin instance. + static bool hasAnyActiveInstance() { + assert(NumActiveInstances <= 1); + return (NumActiveInstances > 0); + } + +protected: + /// Initialize the plugin and prepare for initializing its devices. + void init(int NumDevices, GenericGlobalHandlerTy *GlobalHandler) { + this->NumDevices = NumDevices; + this->GlobalHandler = GlobalHandler; + + assert(Devices.size() == 0); + + Devices.resize(NumDevices, nullptr); + } + + /// Create a new device with a specific device id. + virtual GenericDeviceTy &createDevice(int32_t DeviceId) = 0; + + /// Indicate whether a device id is valid. + bool isValidDeviceId(int32_t DeviceId) const { + return (DeviceId >= 0 && DeviceId < getNumDevices()); + } + +private: + /// Number of devices available for the plugin. + int32_t NumDevices; + + /// Array of pointers to the devices. Initially, they are all set to nullptr. + /// Once a device is initialized, the pointer is stored in the position given + /// by its device id. A position with nullptr means that the corresponding + /// device was not initialized yet. 
+ std::vector Devices; + + /// OpenMP requires flags + int64_t RequiresFlags = OMP_REQ_UNDEFINED; + + /// Pointer to the global handler for this plugin. + GenericGlobalHandlerTy *GlobalHandler; + + /// Internal allocator for different structures. + BumpPtrAllocator Allocator; + + /// Indicates the number of active plugin instances. Actually, we should only + /// have one active instance per plugin library. But we use a counter for + /// simplicity. + static uint32_t NumActiveInstances; +}; + +/// Class for simplifying the getter operation of the plugin. Anywhere on the +/// code, the current plugin can be retrieved by Plugin::get(). The init(), +/// deinit(), get() and getErrorStr() functions should be defined by each plugin +/// implementation. +class Plugin { + /// Avoid instances of this class. + Plugin() {} + Plugin(const Plugin &) = delete; + void operator=(const Plugin &) = delete; + +public: + /// Initialize the plugin if it was not initialized yet. + static StatusCode init(); + + /// Deinitialize the plugin if it was not deinitialized yet. + static StatusCode deinit(); + + /// Get a reference (or create if it was not created) to the plugin instance. + static GenericPluginTy &get(); + + /// Get a reference to the plugin with a specific plugin-specific type. + template + static Ty &get() { + return static_cast(get()); + } + + /// Indicate if the plugin is currently active. Actually, we check if there is + /// any active instances. + static bool isActive() { return GenericPluginTy::hasAnyActiveInstance(); } + + /// Get a string description from a status code. + static const char *getErrorStr(const StatusCode &SC); +}; + +/// Auxiliary interface class for GenericDeviceResourcePoolTy. This class acts +/// as a reference to a device resource, such as a stream, and requires some +/// basic functions to be implemented. The derived class should define an empty +/// constructor that creates an empty and invalid resource reference. Do not +/// create a new resource on the ctor, but on the create() function instead. +struct GenericDeviceResourceRef { + /// Create a new resource and stores a reference. + virtual StatusCode create() = 0; + + /// Destroy and release the resources pointed by the reference. + virtual StatusCode destroy() = 0; +}; + +/// Class that implements a resource pool belonging to a device. This class +/// operates with references to the actual resources. These reference must +/// derive from the GenericDeviceResourceRef class and implement the create +/// and destroy virtual functions. +template class GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create an empty resource pool for a specific device. + GenericDeviceResourcePoolTy(GenericDeviceTy &Device) + : Device(Device), NextAvailable(0) {} + + /// Destroy the resource pool. At this point, the deinit() function should + /// already have been executed so the resource pool should be empty. + virtual ~GenericDeviceResourcePoolTy() { assert(ResourcePool.empty()); } + + /// Initialize the resource pool. + StatusCode init(uint32_t InitialSize) { + assert(ResourcePool.empty() && "resource pool has been initialized"); + return ResourcePoolTy::resizeResourcePool(InitialSize); + } + + /// Deinitialize the resource pool and delete all resources. This function + /// must be called before the destructor. 
+ StatusCode deinit() { + StatusCode SC = ResourcePoolTy::resizeResourcePool(0); + if (SC) + return SC; + + ResourcePool.clear(); + + return StatusCode::SUCCESS; + } + +protected: + /// Get resource from the pool or create new resources. + ResourceRef getResource() { + const std::lock_guard Lock(Mutex); + if (NextAvailable == ResourcePool.size()) { + // By default we double the resource pool every time. + if (ResourcePoolTy::resizeResourcePool(NextAvailable * 2)) + return ResourceRef(); + } + return ResourcePool[NextAvailable++]; + } + + /// Return resource to the pool. + void returnResource(ResourceRef Resource) { + const std::lock_guard Lock(Mutex); + ResourcePool[--NextAvailable] = Resource; + } + +private: + /// The resources between \p OldSize and \p NewSize need to be created or + /// destroyed. The mutex is locked when this function is called. + StatusCode resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) { + assert(OldSize != NewSize && "resizing to the same size"); + + if (Device.setContext()) + return StatusCode::FAIL; + + if (OldSize < NewSize) { + // Create new resources. + for (uint32_t I = OldSize; I < NewSize; ++I) { + StatusCode SC = ResourcePool[I].create(); + if (SC) + return SC; + } + } else { + // Destroy the obsolete resources. + for (uint32_t I = NewSize; I < OldSize; ++I) { + StatusCode SC = ResourcePool[I].destroy(); + if (SC) + return SC; + } + } + return StatusCode::SUCCESS; + } + + /// Increase or decrease the number of resources. This function should + /// be called with the mutex acquired. + StatusCode resizeResourcePool(uint32_t NewSize) { + uint32_t OldSize = ResourcePool.size(); + + StatusCode SC; + if (OldSize == NewSize) { + SC = StatusCode::SUCCESS; + } else if (OldSize < NewSize) { + ResourcePool.resize(NewSize); + SC = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + } else { + SC = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + ResourcePool.resize(NewSize); + } + return SC; + } + + /// The device to which the resources belong + GenericDeviceTy &Device; + + /// Mutex for the resource pool. + std::mutex Mutex; + + /// The next available resource in the pool. + uint32_t NextAvailable; + +protected: + /// The actual resource pool. + std::deque ResourcePool; +}; + +/// Class implementing a common stream manager. This class can be directly used +/// by the specific plugins if necessary. The StreamRef type should derive from +/// the GenericDeviceResourceRef. Look at its description to know the details of +/// their requirements. +template +class GenericStreamManagerTy : public GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create a stream manager with space for an initial number of streams. No + /// stream will be created until the init() function is called. + GenericStreamManagerTy(GenericDeviceTy &Device, uint32_t DefNumStreams = 32) + : ResourcePoolTy(Device), + InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", DefNumStreams) {} + + /// Initialize the stream pool and their resources with the initial number of + /// streams. + StatusCode init() { + return ResourcePoolTy::init(InitialNumStreams.get()); + } + + /// Get an available stream or create new. + StreamRef getStream() { + return ResourcePoolTy::getResource(); + } + + /// Return idle stream. + void returnStream(StreamRef Stream) { + ResourcePoolTy::returnResource(Stream); + } + +private: + /// The initial stream pool size, potentially defined by an envar. 
+ UInt32Envar InitialNumStreams; +}; + +/// Class implementing a common event manager. This class can be directly used +/// by the specific plugins if necessary. The EventRef type should derive from +/// the GenericDeviceResourceRef. Look at its description to know the details of +/// their requirements. +template +struct GenericEventManagerTy : public GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create an event manager with space for an initial number of events. No + /// event will be created until the init() function is called. + GenericEventManagerTy(GenericDeviceTy &Device, uint32_t DefNumEvents = 32) + : ResourcePoolTy(Device), + InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", DefNumEvents) {} + + /// Initialize the event pool and their resources with the initial number of + /// events. + StatusCode init() { + return ResourcePoolTy::init(InitialNumEvents.get()); + } + + /// Get an available event or create new. + EventRef getEvent() { + return ResourcePoolTy::getResource(); + } + + /// Return an idle event. + void returnEvent(EventRef Event) { + ResourcePoolTy::returnResource(Event); + } + +private: + /// The initial event pool size, potentially defined by an envar. + UInt32Envar InitialNumEvents; +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -0,0 +1,821 @@ +//===- PluginInterface.cpp - Target independent plugin device interface ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "elf_common.h" +#include "PluginInterface.h" +#include "GlobalHandler.h" +#include "omptarget.h" +#include "omptargetplugin.h" + +#include +#include + +using namespace llvm; +using namespace omp; +using namespace target; +using namespace plugin; + +uint32_t GenericPluginTy::NumActiveInstances = 0; + +AsyncInfoWrapperTy::~AsyncInfoWrapperTy() { + // If we used a local async info object we want synchronous behavior. + // In that case, and assuming the current status code is OK, we will + // synchronize explicitly when the object is deleted. 
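  // For instance, when dataSubmit() is reached through the synchronous
  // __tgt_rtl_data_submit() entry point, the wrapper is built around a null
  // __tgt_async_info, falls back to LocalAsyncInfo, and this destructor makes
  // the transfer complete before dataSubmit() returns.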
+ if (AsyncInfoPtr == &LocalAsyncInfo && SC.succeed()) + SC = Device.synchronize(&LocalAsyncInfo); +} + +StatusCode GenericKernelTy::init(GenericDeviceTy &GenericDevice) { + PreferredNumThreads = getDefaultNumThreads(GenericDevice); + if (isGenericMode()) + PreferredNumThreads += GenericDevice.getWarpSize(); + + MaxNumThreads = GenericDevice.getThreadLimit(); + + DynamicMemorySize = GenericDevice.getDynamicMemorySize(); + + StatusCode SC = initImpl(GenericDevice); + if (SC) { + REPORT("Failure to initialize kernel\n"); + } + return SC; +} + +StatusCode GenericKernelTy::launch(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, int32_t NumTeamsClause, + int32_t ThreadLimitClause, + int32_t LoopTripCount, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + llvm::SmallVector Args; + llvm::SmallVector Ptrs; + + void *KernelArgsPtr = prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, + NumArgs, Args, Ptrs, AsyncInfoWrapper); + + int32_t NumThreads = getNumThreads(GenericDevice, ThreadLimitClause); + int32_t NumBlocks = getNumBlocks(GenericDevice, NumTeamsClause, LoopTripCount, + NumThreads); + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "Launching kernel %s with %d blocks and %d threads in %s mode\n", + getName(), NumBlocks, NumThreads, getExecutionModeName()); + + StatusCode SC = launchImpl(GenericDevice, NumThreads, NumBlocks, + DynamicMemorySize, NumArgs, KernelArgsPtr, + AsyncInfoWrapper); + if (SC) { + REPORT("Failure to launch kernel %s\n", getName()); + } + return SC; +} + +void *GenericKernelTy::prepareArgs(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + Args.resize(NumArgs); + Ptrs.resize(NumArgs); + + if (NumArgs == 0) + return nullptr; + + for (int I = 0; I < NumArgs; ++I) { + Ptrs[I] = (void *)((intptr_t)ArgPtrs[I] + ArgOffsets[I]); + Args[I] = &Ptrs[I]; + } + return &Args[0]; +} + +int32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, + int32_t ThreadLimitClause) const { + return std::min(MaxNumThreads, (ThreadLimitClause > 0) ? ThreadLimitClause + : PreferredNumThreads); +} + +int32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, + int32_t NumTeamsClause, + int32_t LoopTripCount, + int32_t NumThreads) const { + int32_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice); + if (NumTeamsClause > 0) { + PreferredNumBlocks = NumTeamsClause; + } else if (LoopTripCount > 0) { + if (isSPMDMode()) { + // We have a combined construct, i.e. `target teams distribute + // parallel for [simd]`. We launch so many teams so that each thread + // will execute one iteration of the loop. round up to the nearest + // integer + PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + } else { + assert((isGenericMode() || isGenericSPMDMode()) && + "Unexpected execution mode!"); + // If we reach this point, then we have a non-combined construct, i.e. + // `teams distribute` with a nested `parallel for` and each team is + // assigned one iteration of the `distribute` loop. E.g.: + // + // #pragma omp target teams distribute + // for(...loop_tripcount...) { + // #pragma omp parallel for + // for(...) {} + // } + // + // Threads within a team will execute the iterations of the `parallel` + // loop. 
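      //
      // As a worked example: with LoopTripCount = 1000 the generic path below
      // requests 1000 blocks (one distribute iteration per team), while the
      // SPMD path above with NumThreads = 128 requests
      // ((1000 - 1) / 128) + 1 = 8 blocks; both values are still clamped to
      // the device block limit afterwards.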
+ PreferredNumBlocks = LoopTripCount; + } + } + return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit()); +} + +GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &OMPGridValues) + : OMP_TeamLimit("OMP_TEAM_LIMIT"), + OMP_NumTeams("OMP_NUM_TEAMS"), + OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"), + OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"), + OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"), + // Do not initialize the following two envars since they depend on the + // device initialization. These cannot be consulted until the device is + // initialized correctly. We intialize them in GenericDeviceTy::init(). + OMPX_TargetStackSize(), + OMPX_TargetHeapSize(), + MemoryManager(nullptr), + DeviceId(DeviceId), + GridValues(OMPGridValues), + PeerAccesses(NumDevices, PeerAccessState::PENDING), + PeerAccessesLock() { + if (OMP_NumTeams > 0) + GridValues.GV_Max_Teams = + std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams)); + + if (OMP_TeamsThreadLimit > 0) + GridValues.GV_Max_WG_Size = std::min(GridValues.GV_Max_WG_Size, + uint32_t(OMP_TeamsThreadLimit)); +}; + +StatusCode GenericDeviceTy::init(GenericPluginTy &Plugin) { + StatusCode SC = initImpl(Plugin); + if (SC) { + REPORT("Failure to initialize device\n"); + } + + // Read and reinitialize the envars that depend on the device initialization. + // Notice these two envars may change the stack size and heap size of the + // device, so they need the device properly initialized. + OMPX_TargetStackSize = UInt64Envar("LIBOMPTARGET_STACK_SIZE", + [this](uint64_t &V) -> StatusCode { return getDeviceStackSize(V); }, + [this](uint64_t V) -> StatusCode { return setDeviceStackSize(V); }); + + OMPX_TargetHeapSize = UInt64Envar("LIBOMPTARGET_HEAP_SIZE", + [this](uint64_t &V) -> StatusCode { return getDeviceHeapSize(V); }, + [this](uint64_t V) -> StatusCode { return setDeviceHeapSize(V); }); + + // Enable the memory manager if required. + bool EnableMM; + size_t ThresholdMM; + std::tie(ThresholdMM, EnableMM) = MemoryManagerTy::getSizeThresholdFromEnv(); + if (EnableMM) + MemoryManager = new MemoryManagerTy(*this, ThresholdMM); + + return SC; +} + +StatusCode GenericDeviceTy::deinit() { + // Delete the memory manager before deinitilizing the device. Otherwise, + // we may delete device allocations after the device is deinitialized. + if (MemoryManager) + delete MemoryManager; + MemoryManager = nullptr; + + StatusCode SC = deinitImpl(); + if (SC) { + REPORT("Failure to deinitialize device (%d): %s\n", DeviceId, + Plugin::getErrorStr(SC)); + } + return SC; +} + +__tgt_target_table * +GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage) { + DP("Load data from image " DPxMOD "\n", DPxPTR(TgtImage->ImageStart)); + + // Load the binary and allocate the image object. Use the next available id + // for the image id, which is the number of previously loaded images. + DeviceImageTy *Image = loadBinaryImpl(TgtImage, LoadedImages.size()); + if (!Image) { + REPORT("Failure to load binary (%p) in vendor specific part.\n", TgtImage); + return nullptr; + } + + // Add the image to list. + LoadedImages.push_back(Image); + + // Setup the device environment if needed. + StatusCode SC = setupDeviceEnvironment(Plugin, *Image); + if (SC) { + REPORT("Failure to setup device environment.\n"); + return nullptr; + } + + // Register all offload entries of the image. 
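  // Entries with a nonzero size describe globals; zero-sized entries describe
  // kernels, whose entry address is set to a pointer to the plugin's
  // GenericKernelTy object.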
+ SC = registerOffloadEntries(*Image); + if (SC) { + REPORT("Failed to register offload entries from device image.\n"); + return nullptr; + } + + // Return the pointer to the table of entries. + return Image->getOffloadEntryTable(); +} + +StatusCode GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + DeviceEnvironmentTy DeviceEnvironment; + DeviceEnvironment.DebugKind = OMPX_DebugKind; + DeviceEnvironment.NumDevices = Plugin.getNumDevices(); + // TODO: The device ID used here is not the real device ID used by OpenMP. + DeviceEnvironment.DeviceNum = DeviceId; + DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; + + GlobalTy DeviceEnvGlobal("omptarget_device_environment", + sizeof(DeviceEnvironmentTy), &DeviceEnvironment); + + GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler(); + if (GlobalHandler.writeGlobalToDevice(*this, Image, DeviceEnvGlobal)) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Failed to write device environment into image."); + // TODO: Check the device gfx name against the image gfx name. + return StatusCode::FAIL; + } + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { + const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin; + const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd; + for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) { + if (!Entry->addr) { + // The host should have always something in the address to uniquely + // identify the entry. + INFO(OMP_INFOTYPE_ALL, DeviceId, + "Unexpected host entry without address (size: %ld), abort!\n", + Entry->size); + return StatusCode::FAIL; + } + + StatusCode SC; + __tgt_offload_entry DeviceEntry = {0}; + + if (Entry->size) + SC = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry); + else + SC = registerKernelOffloadEntry(Image, *Entry, DeviceEntry); + if (SC) + return StatusCode::FAIL; + + assert(DeviceEntry.addr && "Device addr of offload entry cannot be null"); + + DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n", + DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name, + DPxPTR(DeviceEntry.addr)); + } + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::registerGlobalOffloadEntry(DeviceImageTy &Image, + const __tgt_offload_entry &GlobalEntry, __tgt_offload_entry &DeviceEntry) { + GenericPluginTy &Plugin = Plugin::get(); + GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler(); + + DeviceEntry = GlobalEntry; + + // Create a metadata object for the device global. + GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size); + + // Get the address of the device of the global. + if (GlobalHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal)) + return StatusCode::FAIL; + + // Store the device address on the device entry. + DeviceEntry.addr = DeviceGlobal.getPtr(); + assert(DeviceEntry.addr && "Device global's address cannot be null"); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. + if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If unified memory is present any target link or to variables + // can access host addresses directly. There is no longer a + // need for device copies. 
+ GlobalTy HostGlobal(GlobalEntry); + if (GlobalHandler.writeGlobalToDevice(*this, Image, HostGlobal, DeviceGlobal)) + return StatusCode::FAIL; + } + + // Add the device entry on the entry table. + Image.getOffloadEntryTable().addEntry(DeviceEntry); + + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::registerKernelOffloadEntry(DeviceImageTy &Image, + const __tgt_offload_entry &KernelEntry, __tgt_offload_entry &DeviceEntry) { + DeviceEntry = KernelEntry; + + // Create a kernel object. + GenericKernelTy *Kernel = constructKernelEntry(KernelEntry, Image); + if (!Kernel) + return StatusCode::FAIL; + + // Initialize the kernel. + StatusCode SC = Kernel->init(*this); + if (SC) + return SC; + + // Set the device entry address to the kernel address and store the entry on + // the entry table. + DeviceEntry.addr = (void *)Kernel; + Image.getOffloadEntryTable().addEntry(DeviceEntry); + + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) { + if (!AsyncInfo || !AsyncInfo->Queue) + return StatusCode::FAIL; + + StatusCode SC = synchronizeImpl(*AsyncInfo); + if (SC) { + REPORT("Error when synchronizing stream (%p): %s\n", AsyncInfo->Queue, + Plugin::getErrorStr(SC)); + } + return SC; +} + +void *GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, + TargetAllocTy Kind) { + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + if (MemoryManager) { + return MemoryManager->allocate(Size, HostPtr); + } + LLVM_FALLTHROUGH; + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + return allocate(Size, HostPtr, Kind); + } + + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet.\n"); + + return nullptr; +} + +StatusCode GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { + StatusCode SC; + + if (MemoryManager) { + SC = StatusCode(MemoryManager->free(TgtPtr)); + } else { + SC = StatusCode(free(TgtPtr, Kind)); + } + + if (SC) { + REPORT("Failed to deallocate device pointer %p\n", TgtPtr); + } + return SC; +} + +StatusCode GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper); + if (SC) { + REPORT("Error when copying data from host to device. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + } + return SC; +} + +StatusCode GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper); + if (SC) { + REPORT("Error when copying data from device to host. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + } + return SC; +} + +StatusCode GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper); + if (SC) { + REPORT("Error when copying data from device (%d) to device (%d). 
Pointers: " + "host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DeviceId, DstDev.DeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); + } + return SC; +} + +StatusCode GenericDeviceTy::runTargetTeamRegion( + void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs, + int32_t NumTeamsClause, int32_t ThreadLimitClause, uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + + GenericKernelTy &GenericKernel = + *reinterpret_cast(EntryPtr); + + int32_t LoopTripCount32 = + (LoopTripCount > uint64_t(std::numeric_limits::max())) + ? std::numeric_limits::max() + : LoopTripCount; + + SC = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause, + ThreadLimitClause, LoopTripCount32, + AsyncInfoWrapper); + if (SC) { + REPORT("Error when running target region " DPxMOD " in device %d\n", + DPxPTR(EntryPtr), DeviceId); + } + return SC; +} + +StatusCode GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { + assert(AsyncInfoPtr && "Async info pointer is null"); + + *AsyncInfoPtr = new __tgt_async_info(); + + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, *AsyncInfoPtr); + + SC = initAsyncInfoImpl(AsyncInfoWrapper); + if (SC) { + REPORT("Error when initializing async info at " DPxMOD " on device %d\n", + DPxPTR(*AsyncInfoPtr), DeviceId); + } + return SC; +} + +StatusCode GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo, + const char **ErrStr) { + assert(DeviceInfo && "Device info is null"); + + *ErrStr = nullptr; + + StatusCode SC = initDeviceInfoImpl(DeviceInfo); + if (SC) { + REPORT("Error when initializing device info at " DPxMOD " on device %d\n", + DPxPTR(DeviceInfo), DeviceId); + } + return SC; +} + +StatusCode GenericPluginTy::initDevice(int32_t DeviceId) { + assert(!Devices[DeviceId] && "Device already initialized"); + + // Create the device and save the reference. + GenericDeviceTy &Device = createDevice(DeviceId); + Devices[DeviceId] = &Device; + + // Initialize the device and its resources. + StatusCode SC = Device.init(*this); + if (SC) { + REPORT("Failure to initialize device (%d): %s\n", DeviceId, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericPluginTy::deinitDevice(int32_t DeviceId) { + // The device may be already deinitialized. + if (Devices[DeviceId] == nullptr) + return StatusCode::SUCCESS; + + // Deinitialize the device and release its resources. + StatusCode SC = Devices[DeviceId]->deinit(); + if (SC) { + REPORT("Failure to deinitialize device (%d): %s\n", DeviceId, + Plugin::getErrorStr(SC)); + } + + // Delete the device and invalidate its reference. 
+ delete Devices[DeviceId]; + Devices[DeviceId] = nullptr; + + return StatusCode::SUCCESS; +} + +void GenericDeviceTy::printInfo() { + // TODO: Print generic information here + printInfoImpl(); +} + +StatusCode GenericDeviceTy::createEvent(void **EventPtrStorage) { + StatusCode SC = createEventImpl(EventPtrStorage); + if (SC) { + REPORT("Failure to create event: %s\n", Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::destroyEvent(void *EventPtr) { + StatusCode SC = destroyEventImpl(EventPtr); + if (SC) { + REPORT("Failure to destroy event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::recordEvent(void *EventPtr, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = recordEventImpl(EventPtr, AsyncInfoWrapper); + if (SC) { + REPORT("Failure to record event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::waitEvent(void *EventPtr, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = waitEventImpl(EventPtr, AsyncInfoWrapper); + if (SC) { + REPORT("Failure to wait for event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::syncEvent(void *EventPtr) { + StatusCode SC = syncEventImpl(EventPtr); + if (SC) { + REPORT("Failure to sync event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +/// Exposed library API function, basically wrappers around the GenericDeviceTy +/// functionality with the same name. All non-async functions are redirected +/// to the async versions right away with a NULL AsyncInfoPtr. +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_init_plugin() { + StatusCode SC = Plugin::init(); + if (SC) { + REPORT("Error when initializing plugin" GETNAME(TARGET_NAME) "\n"); + } + return SC; +} + +int32_t __tgt_rtl_deinit_plugin() { + StatusCode SC = Plugin::deinit(); + if (SC) { + REPORT("Error when deinitializing plugin" GETNAME(TARGET_NAME) "\n"); + } + return SC; +} + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) { + if (!Plugin::isActive()) + return false; + + return elf_check_machine(TgtImage, Plugin::get().getMagicElfBits()); +} + +int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage, + __tgt_image_info *Info) { + if (!Plugin::isActive()) + return false; + + if (!__tgt_rtl_is_valid_binary(TgtImage)) + return false; + + // A subarchitecture was not specified. Assume it is compatible. 
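  // (When present, Arch typically holds the offload architecture string the
  // image was built for, e.g. "sm_70" for CUDA or "gfx90a" for AMDGPU.)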
+ if (!Info->Arch) + return true; + + GenericPluginTy &Plugin = Plugin::get(); + + int32_t NumDevices = Plugin.getNumDevices(); + for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) { + if (!Plugin.isImageCompatible(DeviceId, Info)) + return false; + } + + DP("Image is compatible with current environment: %s\n", Info->Arch); + + return true; +} + +int32_t __tgt_rtl_supports_empty_images() { + // KNOTE: Only VE supports empty images + return Plugin::get().supportsEmptyImages(); +} + +int32_t __tgt_rtl_deinit_device(int32_t DeviceId) { + return Plugin::get().deinitDevice(DeviceId); +} + +int32_t __tgt_rtl_number_of_devices() { + return Plugin::get().getNumDevices(); +} + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + Plugin::get().setRequiresFlag(RequiresFlags); + return RequiresFlags; +} + +int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId); +} + +int32_t __tgt_rtl_init_device(int32_t DeviceId) { + return Plugin::get().initDevice(DeviceId); +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, + __tgt_device_image *TgtImage) { + GenericPluginTy &Plugin = Plugin::get(); + return Plugin.getDevice(DeviceId).loadBinary(Plugin, TgtImage); +} + +void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, + int32_t Kind) { + return Plugin::get().getDevice(DeviceId).dataAlloc(Size, HostPtr, + (TargetAllocTy)Kind); +} + +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) { + return Plugin::get().getDevice(DeviceId).dataDelete(TgtPtr, (TargetAllocTy)Kind); +} + +int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size) { + return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr, + void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, + AsyncInfoPtr); +} + +int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size) { + return __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr, + void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, + AsyncInfoPtr); +} + +int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr, + int32_t DstDeviceId, void *DstPtr, + int64_t Size) { + return __tgt_rtl_data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, + Size, /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, + int DstDeviceId, void *DstPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + GenericDeviceTy &SrcDevice = Plugin::get().getDevice(SrcDeviceId); + GenericDeviceTy &DstDevice = Plugin::get().getDevice(DstDeviceId); + return SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo); +} + +int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, + ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, + uint64_t LoopTripCount) { + return __tgt_rtl_run_target_team_region_async( + DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, + ThreadLimit, LoopTripCount, + /* AsyncInfoPtr */ nullptr); +} + +int32_t 
__tgt_rtl_run_target_team_region_async( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).runTargetTeamRegion( + TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, + LoopTripCount, AsyncInfoPtr); +} + +int32_t __tgt_rtl_synchronize(int32_t DeviceId, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).synchronize(AsyncInfoPtr); +} + +int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs) { + return __tgt_rtl_run_target_region_async(DeviceId, TgtEntryPtr, TgtArgs, + TgtOffsets, NumArgs, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, + void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, + int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr) { + return __tgt_rtl_run_target_team_region_async( + DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, + /* team num*/ 1, /* thread limit */ 1, /* loop tripcount */ 0, + AsyncInfoPtr); +} + +void __tgt_rtl_print_device_info(int32_t DeviceId) { + Plugin::get().getDevice(DeviceId).printInfo(); +} + +int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) { + return Plugin::get().getDevice(DeviceId).createEvent(EventPtr); +} + +int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr); +} + +int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr); +} + +int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) { + return Plugin::get().getDevice(DeviceId).syncEvent(EventPtr); +} + +int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) { + return Plugin::get().getDevice(DeviceId).destroyEvent(EventPtr); +} + +void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { + std::atomic &InfoLevel = getInfoLevelInternal(); + InfoLevel.store(NewInfoLevel); +} + +int32_t __tgt_rtl_init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr) { + assert(AsyncInfoPtr && "async_info is nullptr"); + + return Plugin::get().getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr); +} + +int32_t __tgt_rtl_init_device_info(int32_t DeviceId, + __tgt_device_info *DeviceInfo, + const char **ErrStr) { + return Plugin::get().getDevice(DeviceId).initDeviceInfo(DeviceInfo, ErrStr); +} + +#ifdef __cplusplus +} +#endif Index: openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt @@ -0,0 +1,111 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. 
+# +##===----------------------------------------------------------------------===## +set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL + "Whether to build CUDA plugin") +if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN) + libomptarget_say("Not building CUDA NextGen offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false") + return() +endif() + +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +endif() + +libomptarget_say("Building CUDA NextGen offloading plugin.") + +# Define the suffix for the runtime messaging dumps. +add_definitions("-DTARGET_NAME=CUDA") + +# Define debug prefix. TODO: This should be automatized in the Debug.h but it +# requires changing the original plugins. +add_definitions(-DDEBUG_PREFIX="TARGET CUDA RTL") + +set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) +option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) + +set(LIBOMPTARGET_CAN_LINK_LIBCUDA FALSE) +if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) + set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE) +endif() + +if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA NextGen plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) + add_llvm_library(omptarget.rtl.cuda.nextgen SHARED + + src/rtl.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + + NO_INSTALL_RPATH + ) +else() + libomptarget_say("Building CUDA NextGen plugin for dlopened libcuda") + include_directories(dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda.nextgen + SHARED + + src/rtl.cpp + dynamic_cuda/cuda.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${CMAKE_DL_LIBS} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + + NO_INSTALL_RPATH + ) +endif() +add_dependencies(omptarget.rtl.cuda.nextgen omptarget.devicertl.nvptx) + +# Install plugin under the lib destination folder. +install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") +set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES + INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") + +target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} +) + +# Report to the parent scope that we are building a plugin for CUDA. 
+# This controls whether tests are run for the nvptx offloading target +# Run them if libcuda is available, or if the user explicitly asked for dlopen +# Otherwise this plugin is being built speculatively and there may be no cuda available +#if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) +# libomptarget_say("Enable tests using CUDA NextGen plugin") +# set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-nextgen nvptx64-nvidia-cuda-nextgen-oldDriver" PARENT_SCOPE) +# set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-nextgen nvptx64-nvidia-cuda-nextgen-LTO" PARENT_SCOPE) +# list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda.nextgen") +# set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) +#else() +# libomptarget_say("Disabling tests using CUDA NextGen plugin as cuda may not be available") +#endif() Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -0,0 +1,266 @@ +//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The parts of the cuda api that are presently in use by the openmp cuda plugin +// +//===----------------------------------------------------------------------===// + +#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED +#define DYNAMIC_CUDA_CUDA_H_INCLUDED + +#include +#include + +typedef int CUdevice; +typedef uintptr_t CUdeviceptr; +typedef struct CUmod_st *CUmodule; +typedef struct CUctx_st *CUcontext; +typedef struct CUfunc_st *CUfunction; +typedef struct CUstream_st *CUstream; +typedef struct CUevent_st *CUevent; + +typedef enum cudaError_enum { + CUDA_SUCCESS = 0, + CUDA_ERROR_INVALID_VALUE = 1, + CUDA_ERROR_NO_DEVICE = 100, + CUDA_ERROR_INVALID_HANDLE = 400, +} CUresult; + +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, + CU_STREAM_NON_BLOCKING = 0x1, +} CUstream_flags; + +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x0, + CU_LIMIT_PRINTF_FIFO_SIZE = 0x1, + CU_LIMIT_MALLOC_HEAP_SIZE = 0x2, + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x3, + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x4, + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x5, + CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x6, + CU_LIMIT_MAX +} CUlimit; + +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, + 
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, + 
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, + CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, + CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, + CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, + CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, + CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, + CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, + CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, + CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, + CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, + CU_DEVICE_ATTRIBUTE_MAX, +} CUdevice_attribute; + +typedef enum CUfunction_attribute_enum { + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, +} CUfunction_attribute; + +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, + CU_CTX_SCHED_MASK = 0x07, +} CUctx_flags; + +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, + CU_MEM_ATTACH_HOST = 0x2, + CU_MEM_ATTACH_SINGLE = 0x4, +} CUmemAttach_flags; + +typedef enum CUcomputeMode_enum { + CU_COMPUTEMODE_DEFAULT = 0, + CU_COMPUTEMODE_PROHIBITED = 2, + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, +} CUcompute_mode; + +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, + CU_EVENT_BLOCKING_SYNC = 0x1, + CU_EVENT_DISABLE_TIMING = 0x2, + CU_EVENT_INTERPROCESS = 0x4 +} CUevent_flags; + +CUresult cuCtxGetDevice(CUdevice *); +CUresult cuDeviceGet(CUdevice *, int); +CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice); +CUresult cuDeviceGetCount(int *); +CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction); + +// Device info +CUresult 
cuDeviceGetName(char *, int, CUdevice); +CUresult cuDeviceTotalMem(size_t *, CUdevice); +CUresult cuDriverGetVersion(int *); + +CUresult cuGetErrorString(CUresult, const char **); +CUresult cuInit(unsigned); +CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, + unsigned, unsigned, unsigned, CUstream, void **, + void **); + +CUresult cuMemAlloc(CUdeviceptr *, size_t); +CUresult cuMemAllocHost(void **, size_t); +CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); + +CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); +CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); +CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); +CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); +CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); + +CUresult cuMemFree(CUdeviceptr); +CUresult cuMemFreeHost(void *); + +CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); +CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); + +CUresult cuModuleUnload(CUmodule); +CUresult cuStreamCreate(CUstream *, unsigned); +CUresult cuStreamDestroy(CUstream); +CUresult cuStreamSynchronize(CUstream); +CUresult cuCtxSetCurrent(CUcontext); +CUresult cuDevicePrimaryCtxRelease(CUdevice); +CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *); +CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned); +CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice); +CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *, + void **); + +CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice); +CUresult cuCtxEnablePeerAccess(CUcontext, unsigned); +CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, + size_t, CUstream); + +CUresult cuCtxGetLimit(size_t *, CUlimit); +CUresult cuCtxSetLimit(CUlimit, size_t); + +CUresult cuEventCreate(CUevent *, unsigned int); +CUresult cuEventRecord(CUevent, CUstream); +CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int); +CUresult cuEventSynchronize(CUevent); +CUresult cuEventDestroy(CUevent); + +#endif Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -0,0 +1,145 @@ +//===--- cuda/dynamic_cuda/cuda.pp ------------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implement subset of cuda api by calling into cuda library via dlopen +// Does the dlopen/dlsym calls as part of the call to cuInit +// +//===----------------------------------------------------------------------===// + +#include "cuda.h" +#include "Debug.h" +#include "dlwrap.h" + +#include +#include + +#include + +DLWRAP_INITIALIZE(); + +DLWRAP_INTERNAL(cuInit, 1); + +DLWRAP(cuCtxGetDevice, 1); +DLWRAP(cuDeviceGet, 2); +DLWRAP(cuDeviceGetAttribute, 3); +DLWRAP(cuDeviceGetCount, 1); +DLWRAP(cuFuncGetAttribute, 3); + +// Device info +DLWRAP(cuDeviceGetName, 3); +DLWRAP(cuDeviceTotalMem, 2); +DLWRAP(cuDriverGetVersion, 1); + +DLWRAP(cuGetErrorString, 2); +DLWRAP(cuLaunchKernel, 11); + +DLWRAP(cuMemAlloc, 2); +DLWRAP(cuMemAllocHost, 2); +DLWRAP(cuMemAllocManaged, 3); + +DLWRAP(cuMemcpyDtoDAsync, 4); +DLWRAP(cuMemcpyDtoH, 3); +DLWRAP(cuMemcpyDtoHAsync, 4); +DLWRAP(cuMemcpyHtoD, 3); +DLWRAP(cuMemcpyHtoDAsync, 4); + +DLWRAP(cuMemFree, 1); +DLWRAP(cuMemFreeHost, 1); +DLWRAP(cuModuleGetFunction, 3); +DLWRAP(cuModuleGetGlobal, 4); + +DLWRAP(cuModuleUnload, 1); +DLWRAP(cuStreamCreate, 2); +DLWRAP(cuStreamDestroy, 1); +DLWRAP(cuStreamSynchronize, 1); +DLWRAP(cuCtxSetCurrent, 1); +DLWRAP(cuDevicePrimaryCtxRelease, 1); +DLWRAP(cuDevicePrimaryCtxGetState, 3); +DLWRAP(cuDevicePrimaryCtxSetFlags, 2); +DLWRAP(cuDevicePrimaryCtxRetain, 2); +DLWRAP(cuModuleLoadDataEx, 5); + +DLWRAP(cuDeviceCanAccessPeer, 3); +DLWRAP(cuCtxEnablePeerAccess, 2); +DLWRAP(cuMemcpyPeerAsync, 6); + +DLWRAP(cuCtxGetLimit, 2); +DLWRAP(cuCtxSetLimit, 2); + +DLWRAP(cuEventCreate, 2); +DLWRAP(cuEventRecord, 2); +DLWRAP(cuStreamWaitEvent, 3); +DLWRAP(cuEventSynchronize, 1); +DLWRAP(cuEventDestroy, 1); + +DLWRAP_FINALIZE(); + +#ifndef DYNAMIC_CUDA_PATH +#define DYNAMIC_CUDA_PATH "libcuda.so" +#endif + +static bool checkForCUDA() { + // return true if dlopen succeeded and all functions found + + // Prefer _v2 versions of functions if found in the library + std::unordered_map TryFirst = { + {"cuMemAlloc", "cuMemAlloc_v2"}, + {"cuMemFree", "cuMemFree_v2"}, + {"cuMemcpyDtoH", "cuMemcpyDtoH_v2"}, + {"cuMemcpyHtoD", "cuMemcpyHtoD_v2"}, + {"cuStreamDestroy", "cuStreamDestroy_v2"}, + {"cuModuleGetGlobal", "cuModuleGetGlobal_v2"}, + {"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"}, + {"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"}, + {"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"}, + {"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"}, + {"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"}, + }; + + const char *CudaLib = DYNAMIC_CUDA_PATH; + void *DynlibHandle = dlopen(CudaLib, RTLD_NOW); + if (!DynlibHandle) { + DP("Unable to load library '%s': %s!\n", CudaLib, dlerror()); + return false; + } + + for (size_t I = 0; I < dlwrap::size(); I++) { + const char *Sym = dlwrap::symbol(I); + + auto It = TryFirst.find(Sym); + if (It != TryFirst.end()) { + const char *First = It->second; + void *P = dlsym(DynlibHandle, First); + if (P) { + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P); + *dlwrap::pointer(I) = P; + continue; + } + } + + void *P = dlsym(DynlibHandle, Sym); + if (P == nullptr) { + DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib); + return false; + } + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); + + *dlwrap::pointer(I) = P; + } + + return true; +} + +CUresult cuInit(unsigned X) { + // Note: Called exactly once from cuda rtl.cpp in a global constructor 
so + // does not need to handle being called repeatedly or concurrently + if (!checkForCUDA()) { + return CUDA_ERROR_INVALID_HANDLE; + } + return dlwrap_cuInit(X); +} Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -0,0 +1,1025 @@ +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL NextGen for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Forward declarations for all specialized data structures. +struct CUDAKernelTy; +struct CUDADeviceTy; +struct CUDAPluginTy; +struct CUDAStreamManagerTy; +struct CUDAEventManagerTy; + +// TODO: Improve this function. +template +static bool checkResult(CUresult Err, const char *ErrMsg, ArgsTy... Args) { + if (Err == CUDA_SUCCESS) + return false; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wformat-security" + REPORT(ErrMsg, Args...); +#pragma clang diagnostic pop + + const char *ErrStr = nullptr; + CUresult ErrStrStatus = cuGetErrorString(Err, &ErrStr); + if (ErrStrStatus == CUDA_ERROR_INVALID_VALUE) { + REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code: %d\n", Err); + } else if (ErrStrStatus == CUDA_SUCCESS) { + REPORT(GETNAME(TARGET_NAME) " error is: %s\n", ErrStr); + } else { + REPORT("Unresolved " GETNAME(TARGET_NAME) " error code: %d\n" + "Unsuccessful cuGetErrorString return status: %d\n", Err, ErrStrStatus); + } + return true; +} + +/// Class implementing the CUDA kernel functionalities which derives from the +/// generic kernel class. +struct CUDAKernelTy : public GenericKernelTy { + /// Create a CUDA kernel with a name, an execution mode, and the kernel function. + CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, + CUfunction Func) + : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + + /// Initialize the CUDA kernel + StatusCode initImpl(GenericDeviceTy &GenericDevice) override { + int MaxThreads; + CUresult Err = cuFuncGetAttribute( + &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func); + if (checkResult(Err, "Error returned from cuFuncGetAttribute\n")) + return StatusCode(Err); + + /// Set the maximum number of threads for the CUDA kernel. + MaxNumThreads = std::min(MaxNumThreads, MaxThreads); + + return StatusCode::SUCCESS; + } + + /// Launch the CUDA kernel function + StatusCode launchImpl(GenericDeviceTy &GenericDevice, int32_t NumThreads, + int32_t NumBlocks, int32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + + /// The default number of blocks is common to the whole device. 
+ int32_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumBlocks(); + } + + /// The default number of threads is common to the whole device. + int32_t getDefaultNumThreads(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumThreads(); + } + +private: + /// The CUDA kernel function to execute. + CUfunction Func; +}; + +/// Class wrapping a CUDA stream reference. These are the objects handled by the +/// Stream Manager for the CUDA plugin. +class CUDAStreamRef final : public GenericDeviceResourceRef { + /// The reference to the CUDA stream. + CUstream Stream; + +public: + /// Create an empty reference to an invalid stream. + CUDAStreamRef() : Stream(nullptr) {} + + /// Create a reference to an existing stream. + CUDAStreamRef(CUstream Stream) : Stream(Stream) {} + + /// Create a new stream and save the reference. The reference must be empty + /// before calling to this function. + StatusCode create() override { + if (Stream != nullptr) + return StatusCode::FAIL; + + CUresult Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING); + if (checkResult(Err, "Error returned from cuStreamCreate\n")) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; + } + + /// Destroy the referenced stream and invalidate the reference. The reference + /// must be to a valid stream before calling to this function. + StatusCode destroy() override { + if (Stream == nullptr) + return StatusCode::FAIL; + + CUresult Err = cuStreamDestroy(Stream); + if (checkResult(Err, "Error returned from cuStreamDestroy\n")) + return StatusCode::FAIL; + + Stream = nullptr; + return StatusCode::SUCCESS; + } + + /// Get the underlying CUstream. + operator CUstream() const { return Stream; } +}; + +/// Class wrapping a CUDA event reference. These are the objects handled by the +/// Event Manager for the CUDA plugin. +class CUDAEventRef final : public GenericDeviceResourceRef { + CUevent Event; + +public: + /// Create an empty reference to an invalid event. + CUDAEventRef() : Event(nullptr) {} + + /// Create a reference to an existing event. + CUDAEventRef(CUevent Event) : Event(Event) {} + + /// Create a new event and save the reference. The reference must be empty + /// before calling to this function. + StatusCode create() override { + if (Event != nullptr) + return StatusCode::FAIL; + + CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); + if (checkResult(Err, "Error returned from cuEventCreate\n")) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; + } + + /// Destroy the referenced event and invalidate the reference. The reference + /// must be to a valid event before calling to this function. + StatusCode destroy() override { + if (Event == nullptr) + return StatusCode::FAIL; + + CUresult Err = cuEventDestroy(Event); + if (checkResult(Err, "Error returned from cuEventDestroy\n")) + return StatusCode::FAIL; + + Event = nullptr; + return StatusCode::SUCCESS; + } + + /// Get the underlying CUevent. + operator CUevent() const { return Event; } +}; + +/// Class implementing the CUDA device images properties. +struct CUDADeviceImageTy : public DeviceImageTy { + /// Create the CUDA image with the id and the target image pointer. + CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} + + /// Getter and setter for the CUDA module. 
+ CUmodule getModule() const { return Module; } + void setModule(CUmodule CUDAModule) { Module = CUDAModule; } + +private: + /// The CUDA module that loaded the image. + CUmodule Module; +}; + +/// Class implementing the CUDA device functionalities which derives from the +/// generic device class. +struct CUDADeviceTy : public GenericDeviceTy { + // Create a CUDA device with a device id and the default CUDA grid values. + CUDADeviceTy(int32_t DeviceId, int32_t NumDevices) + : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues), + CUDAStreamManager(*this), CUDAEventManager(*this) {} + + ~CUDADeviceTy() {} + + /// Initialize the device, its resources and get its properties. + StatusCode initImpl(GenericPluginTy &Plugin) override { + CUresult Err = cuDeviceGet(&Device, DeviceId); + if (checkResult(Err, "Error returned from cuDeviceGet\n")) + return StatusCode::FAIL; + + // Query the current flags of the primary context and set its flags if + // it is inactive. + unsigned int FormerPrimaryCtxFlags = 0; + int FormerPrimaryCtxIsActive = 0; + Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, + &FormerPrimaryCtxIsActive); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) + return StatusCode::FAIL; + + if (FormerPrimaryCtxIsActive) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "The primary context is active, no change to its flags\n"); + if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != + CU_CTX_SCHED_BLOCKING_SYNC) + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); + } else { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "The primary context is inactive, set its flags to " + "CU_CTX_SCHED_BLOCKING_SYNC\n"); + Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) + return StatusCode::FAIL; + } + + // Retain the per device primary context and save it to use whenever this + // device is selected. + Err = cuDevicePrimaryCtxRetain(&Context, Device); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) + return StatusCode::FAIL; + + if (setContext()) + return StatusCode::FAIL; + + // Initialize stream pool. + if (CUDAStreamManager.init()) + return StatusCode::FAIL; + + if (CUDAEventManager.init()) + return StatusCode::FAIL; + + // Query attributes to determine number of threads/block and blocks/grid. + if (getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + GridValues.GV_Max_Teams)) + return StatusCode::FAIL; + if (getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + GridValues.GV_Max_WG_Size)) + return StatusCode::FAIL; + if (getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, GridValues.GV_Warp_Size)) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; + } + + /// Deinitialize the device and release its resources. + StatusCode deinitImpl() override { + if (Context && setContext()) + return StatusCode::FAIL; + + // Deinitialize the stream manager. + if (CUDAStreamManager.deinit()) + return StatusCode::FAIL; + + if (CUDAEventManager.deinit()) + return StatusCode::FAIL; + + // Close modules if necessary. + if (!LoadedImages.empty()) { + assert(Context && "Context is invalid"); + + // Each image has its own module. + for (DeviceImageTy *Image : LoadedImages) { + CUDADeviceImageTy &CUDAImage = + static_cast(*Image); + + // Unload the module of the image. 
+ CUresult Err = cuModuleUnload(CUDAImage.getModule()); + if (checkResult(Err, "Error returned from cuModuleUnload\n")) + return StatusCode(Err); + + CUDAImage.setModule(nullptr); + } + } + + if (Context) { + CUresult Err = cuDevicePrimaryCtxRelease(Device); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxRelease\n")) + return StatusCode(Err); + } + + // Invalidate context and device references. + Context = nullptr; + Device = CU_DEVICE_INVALID; + + return StatusCode::SUCCESS; + } + + /// Allocate and construct a CUDA kernel. + GenericKernelTy * + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + CUDADeviceImageTy &CUDAImage = + static_cast(Image); + + // Retrieve the function pointer of the kernel. + CUfunction Func; + CUresult Err = cuModuleGetFunction(&Func, CUDAImage.getModule(), + KernelEntry.name); + if (checkResult(Err, "Failed loading '%s'\n", KernelEntry.name)) + return nullptr; + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry), + KernelEntry.name, DPxPTR(Func)); + + // Create a metadata object for the exec mode global (auto-generated). + StaticGlobalTy ExecModeGlobal( + KernelEntry.name, "_exec_mode"); + + // Retrieve execution mode for the kernel. + GenericGlobalHandlerTy &GlobalHandler = Plugin::get().getGlobalHandler(); + if (GlobalHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) { + DP("Failed to read execution mode for '%s', using default GENERIC (1)\n", + KernelEntry.name); + // In some cases the execution mode is not included, so use the default. + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_GENERIC); + } + + // Allocate and initialize the CUDA kernel. + CUDAKernelTy *CUDAKernel = Plugin::get().allocate(); + new (CUDAKernel) + CUDAKernelTy(KernelEntry.name, ExecModeGlobal.getValue(), Func); + + return CUDAKernel; + } + + /// Set the current context to this device's context. + StatusCode setContext() override { + CUresult Err = cuCtxSetCurrent(Context); + if (checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return StatusCode::FAIL; + return StatusCode::SUCCESS; + } + + /// Get the stream of the asynchronous info sructure or get a new one. + CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { + CUstream &Stream = AsyncInfoWrapper.getQueueAs(); + if (!Stream) + Stream = CUDAStreamManager.getStream(); + return Stream; + } + + /// Getters of CUDA references. + CUcontext getCUDAContext() const { return Context; } + CUdevice getCUDADevice() const { return Device; } + + /// Load the binary image into the device and allocate an image object. + DeviceImageTy *loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) override { + if (setContext()) + return nullptr; + + // Allocate and initialize the image object. + CUDADeviceImageTy *CUDAImage = Plugin::get().allocate(); + new (CUDAImage) + CUDADeviceImageTy(ImageId, TgtImage); + + // Load the binary as a CUDA module. + CUmodule Module = nullptr; + CUresult Err = cuModuleLoadDataEx(&Module, CUDAImage->getStart(), 0, + nullptr, nullptr); + if (checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) + return nullptr; + + // Save the reference of the module on the image. + CUDAImage->setModule(Module); + + return CUDAImage; + } + + /// Allocate memory on the device or related to the device. 
+ void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + if (Size == 0) + return nullptr; + + if (setContext()) + return nullptr; + + CUresult Err; + void *MemAlloc = nullptr; + CUdeviceptr DevicePtr; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + Err = cuMemAlloc(&DevicePtr, Size); + if (checkResult(Err, "Error returned from cuMemAlloc\n")) + return nullptr; + MemAlloc = (void *)DevicePtr; + break; + case TARGET_ALLOC_HOST: + Err = cuMemAllocHost(&MemAlloc, Size); + if (checkResult(Err, "Error returned from cuMemAllocHost\n")) + return nullptr; + break; + case TARGET_ALLOC_SHARED: + Err = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); + if (checkResult(Err, "Error returned from cuMemAllocManaged\n")) + return nullptr; + MemAlloc = (void *)DevicePtr; + break; + } + + return MemAlloc; + } + + /// Deallocate memory on the device or related to the device. + int free(void *TgtPtr, TargetAllocTy Kind) override { + if (TgtPtr == nullptr) + return OFFLOAD_SUCCESS; + + if (setContext()) + return OFFLOAD_FAIL; + + CUresult Err; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_SHARED: + Err = cuMemFree((CUdeviceptr)TgtPtr); + if (checkResult(Err, "Error returned from cuMemFree\n")) + return OFFLOAD_FAIL; + break; + case TARGET_ALLOC_HOST: + Err = cuMemFreeHost(TgtPtr); + if (checkResult(Err, "Error returned from cuMemFreeHost\n")) + return OFFLOAD_FAIL; + break; + } + + return OFFLOAD_SUCCESS; + } + + /// Synchronize current thread with the pending operations on the async info. + StatusCode synchronizeImpl(__tgt_async_info &AsyncInfo) override { + CUstream Stream = reinterpret_cast(AsyncInfo.Queue); + CUresult Err = cuStreamSynchronize(Stream); + + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo. This is to make sure the synchronization only works for its + // own tasks. + CUDAStreamManager.returnStream(Stream); + AsyncInfo.Queue = nullptr; + + return StatusCode(Err); + } + + /// Submit data to the device (host to device transfer). + StatusCode dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (setContext()) + return StatusCode::FAIL; + + CUstream Stream = getStream(AsyncInfoWrapper); + + return StatusCode( + cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream)); + } + + /// Retrieve data from the device (device to host transfer). + StatusCode dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (setContext()) + return StatusCode::FAIL; + + CUstream Stream = getStream(AsyncInfoWrapper); + + return StatusCode( + cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream)); + } + + /// Exchange data between two devices directly. We may use peer access if + /// the CUDA devices and driver allow them. + StatusCode dataExchangeImpl(const void *SrcPtr, + GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + + /// Initialize the async info for interoperability purposes. + StatusCode initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (setContext()) + return StatusCode::FAIL; + + getStream(AsyncInfoWrapper); + + return StatusCode::SUCCESS; + } + + /// Initialize the device info for interoperability purposes. 
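For reference, the TARGET_ALLOC_* kinds handled by allocate()/free() above are driven from host code that goes through libomptarget's allocator entry points. A minimal sketch using only the standard OpenMP API; the HOST and SHARED kinds are reachable through LLVM-specific allocator extensions that are not shown here.

#include <omp.h>
#include <cstdio>

int main() {
  int Dev = omp_get_default_device();

  // omp_target_alloc() reaches the plugin's allocate() with the default
  // kind, which this CUDA plugin maps to cuMemAlloc().
  void *DevPtr = omp_target_alloc(1024, Dev);
  if (!DevPtr) {
    std::printf("no device memory (or no device) available\n");
    return 0;
  }

  // omp_target_free() reaches the plugin's free() for the same allocation.
  omp_target_free(DevPtr, Dev);
  return 0;
}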
+ StatusCode initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { + assert(Context && "Context is null"); + assert(Device != CU_DEVICE_INVALID && "Device is invalid"); + + if (setContext()) + return StatusCode::FAIL; + + if (!DeviceInfo->Context) + DeviceInfo->Context = Context; + + if (!DeviceInfo->Device) + DeviceInfo->Device = reinterpret_cast(Device); + + return StatusCode::SUCCESS; + } + + /// Create an event. + StatusCode createEventImpl(void **EventPtrStorage) override { + CUevent *Event = reinterpret_cast(EventPtrStorage); + *Event = CUDAEventManager.getEvent(); + return StatusCode::SUCCESS; + } + + /// Destroy a previously created event. + StatusCode destroyEventImpl(void *EventPtr) override { + CUevent Event = reinterpret_cast(EventPtr); + CUDAEventManager.returnEvent(Event); + return StatusCode::SUCCESS; + } + + /// Record the event. + StatusCode recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + CUevent Event = reinterpret_cast(EventPtr); + return StatusCode(cuEventRecord(Event, getStream(AsyncInfoWrapper))); + } + + /// Make the stream wait on the event. + StatusCode waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from + // specific CUDA version, and defined as 0x0. In previous version, per CUDA + // API document, that argument has to be 0x0. + CUevent Event = reinterpret_cast(EventPtr); + return StatusCode(cuStreamWaitEvent(getStream(AsyncInfoWrapper), Event, 0)); + } + + /// Synchronize the current thread with the event. + StatusCode syncEventImpl(void *EventPtr) override { + CUevent Event = reinterpret_cast(EventPtr); + return StatusCode(cuEventSynchronize(Event)); + } + + /// Print information about the device. 
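The event entry points above wrap a standard CUDA ordering idiom: record an event on one stream and make another stream (or the host) wait for it. A stand-alone driver-API sketch of that idiom, independent of the plugin, is shown below.

#include <cuda.h>
#include <cstdio>

static void check(CUresult R, const char *Msg) {
  if (R != CUDA_SUCCESS)
    std::printf("%s failed (%d)\n", Msg, (int)R);
}

int main() {
  CUdevice Dev;
  CUcontext Ctx;
  CUstream A, B;
  CUevent Ev;

  check(cuInit(0), "cuInit");
  check(cuDeviceGet(&Dev, 0), "cuDeviceGet");
  check(cuCtxCreate(&Ctx, 0, Dev), "cuCtxCreate");
  check(cuStreamCreate(&A, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  check(cuStreamCreate(&B, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  check(cuEventCreate(&Ev, CU_EVENT_DEFAULT), "cuEventCreate");

  // recordEventImpl: snapshot the work currently enqueued on stream A.
  check(cuEventRecord(Ev, A), "cuEventRecord");
  // waitEventImpl: stream B will not run past this point until A reaches it.
  check(cuStreamWaitEvent(B, Ev, 0), "cuStreamWaitEvent");
  // syncEventImpl: block the host until the recorded work has completed.
  check(cuEventSynchronize(Ev), "cuEventSynchronize");

  cuEventDestroy(Ev);
  cuStreamDestroy(A);
  cuStreamDestroy(B);
  cuCtxDestroy(Ctx);
  return 0;
}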
+ void printInfoImpl() override { + char TmpChar[1000]; + std::string TmpStr; + size_t TmpSt; + int TmpInt, TmpInt2, TmpInt3; + + cuDriverGetVersion(&TmpInt); + printf(" CUDA Driver Version: \t\t%d \n", TmpInt); + printf(" CUDA Device Number: \t\t%d \n", DeviceId); + checkResult(cuDeviceGetName(TmpChar, 1000, Device), + "Error returned from cuDeviceGetName\n"); + printf(" Device Name: \t\t\t%s \n", TmpChar); + checkResult(cuDeviceTotalMem(&TmpSt, Device), + "Error returned from cuDeviceTotalMem\n"); + printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Concurrent Copy and Execution: \t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Registers per Block: \t\t%d \n", TmpInt); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, + TmpInt3); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, + TmpInt3); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), + 
"Error returned from cuDeviceGetAttribute\n"); + printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Execution Timeout: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Integrated Device: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Can Map Host Memory: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), + "Error returned from cuDeviceGetAttribute\n"); + if (TmpInt == CU_COMPUTEMODE_DEFAULT) + TmpStr = "DEFAULT"; + else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) + TmpStr = "PROHIBITED"; + else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) + TmpStr = "EXCLUSIVE PROCESS"; + else + TmpStr = "unknown"; + printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Concurrent Kernels: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" ECC Enabled: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); + checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, + Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Async Engines: \t\t\t%s (%d) \n", StringParser::boolToYesNoString(TmpInt), TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Unified Addressing: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Managed Memory: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Concurrent Managed Memory: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), + "Error 
returned from cuDeviceGetAttribute\n");
+    printf(" Preemption Supported: \t\t%s \n", StringParser::boolToYesNoString(TmpInt));
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
+                "Error returned from cuDeviceGetAttribute\n");
+    printf(" Cooperative Launch: \t\t%s \n", StringParser::boolToYesNoString(TmpInt));
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
+                "Error returned from cuDeviceGetAttribute\n");
+    printf(" Multi-Device Boards: \t\t%s \n", StringParser::boolToYesNoString(TmpInt));
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
+        "Error returned from cuDeviceGetAttribute\n");
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
+        "Error returned from cuDeviceGetAttribute\n");
+    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
+  }
+
+  /// Getters and setters for stack and heap sizes.
+  StatusCode getDeviceStackSize(uint64_t &Value) override {
+    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
+  }
+  StatusCode setDeviceStackSize(uint64_t Value) override {
+    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
+  }
+  StatusCode getDeviceHeapSize(uint64_t &Value) override {
+    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
+  }
+  StatusCode setDeviceHeapSize(uint64_t Value) override {
+    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
+  }
+
+  /// CUDA-specific functions for getting and setting context limits.
+  StatusCode setCtxLimit(CUlimit Kind, uint64_t Value) {
+    CUresult Res = cuCtxSetLimit(Kind, Value);
+    return StatusCode(Res);
+  }
+  StatusCode getCtxLimit(CUlimit Kind, uint64_t &Value) {
+    CUresult Res = cuCtxGetLimit(&Value, Kind);
+    return StatusCode(Res);
+  }
+
+  /// CUDA-specific function to get device attributes.
+  StatusCode getDeviceAttr(uint32_t Kind, uint32_t &Value) {
+    // TODO: Warn if the new value is larger than the old.
+    CUresult Err =
+        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
+    if (checkResult(Err, "Error returned from cuDeviceGetAttribute\n"))
+      return StatusCode::FAIL;
+    return StatusCode::SUCCESS;
+  }
+
+private:
+  using CUDAStreamManagerTy = GenericStreamManagerTy;
+  using CUDAEventManagerTy = GenericEventManagerTy;
+
+  /// Stream manager for CUDA streams.
+  CUDAStreamManagerTy CUDAStreamManager;
+
+  /// Event manager for CUDA events.
+  CUDAEventManagerTy CUDAEventManager;
+
+  /// The device's context. This context should be set before performing
+  /// operations on the device.
+  CUcontext Context = nullptr;
+
+  /// The CUDA device handle.
+  CUdevice Device = CU_DEVICE_INVALID;
+};
+
+StatusCode CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                                    int32_t NumThreads, int32_t NumBlocks,
+                                    int32_t DynamicMemorySize,
+                                    int32_t NumKernelArgs, void *KernelArgs,
+                                    AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+  CUstream Stream = CUDADevice.getStream(AsyncInfoWrapper);
+
+  return StatusCode(cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1,
+                                   /* gridDimZ */ 1, NumThreads,
+                                   /* blockDimY */ 1, /* blockDimZ */ 1,
+                                   DynamicMemorySize, Stream,
+                                   (void **)KernelArgs, nullptr));
+}
+
+/// Class implementing the CUDA-specific functionalities of the global handler.
+class CUDAGlobalHandler final : public GenericGlobalHandlerTy {
+public:
+  /// Get the metadata of a global from the device.
The name and size of the + /// global is read from DeviceGlobal and the address of the global is written + /// to DeviceGlobal. + StatusCode getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override { + CUDADeviceImageTy &CUDAImage = static_cast(Image); + + const char *GlobalName = DeviceGlobal.getName().data(); + + size_t CUSize; + CUdeviceptr CUPtr; + CUresult Err = cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), + GlobalName); + if (checkResult(Err, "Failed to load global '%s'\n", GlobalName)) + return StatusCode::FAIL; + + if (CUSize != DeviceGlobal.getSize()) { + DP("Failed to load global '%s' due to size mismatch (%zu != %zu)\n", + GlobalName, CUSize, (size_t) DeviceGlobal.getSize()); + return StatusCode::FAIL; + } + + DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); + return StatusCode::SUCCESS; + } +}; + +/// Class implementing the CUDA-specific functionalities of the plugin. +struct CUDAPluginTy final : public GenericPluginTy { + /// Create a CUDA plugin and initialize the CUDA driver. + CUDAPluginTy() : GenericPluginTy() { + CUresult Err = cuInit(0); + if (Err == CUDA_ERROR_INVALID_HANDLE) { + // Cannot call cuGetErrorString if dlsym failed. + DP("Failed to load CUDA shared library\n"); + return; + } else if (Err == CUDA_ERROR_NO_DEVICE) { + // Do not initialize if there are no devices. + DP("There are no devices supporting CUDA.\n"); + return; + } + + if (checkResult(Err, "Error returned from cuInit\n")) + return; + + // Get the number of devices. + int NumDevices; + Err = cuDeviceGetCount(&NumDevices); + if (checkResult(Err, "Error returned from cuDeviceGetCount\n")) + return; + + // Do not initialize if there are no devices. + if (NumDevices == 0) { + DP("There are no devices supporting CUDA.\n"); + return; + } + + // Initialize the generic plugin structure. + GenericPluginTy::init(NumDevices, new CUDAGlobalHandler()); + } + + /// This class should not be copied. + CUDAPluginTy(const CUDAPluginTy &) = delete; + CUDAPluginTy(CUDAPluginTy &&) = delete; + + ~CUDAPluginTy() {} + + /// Get the ELF code for recognizing the compatible image binary. + uint16_t getMagicElfBits() const override { return /* EM_CUDA */ 190; } + + /// Create a CUDA device with a specific id. + CUDADeviceTy &createDevice(int32_t DeviceId) override { + CUDADeviceTy *Device = new CUDADeviceTy(DeviceId, getNumDevices()); + return *Device; + } + + /// Check whether the image is compatible with a CUDA device. + bool isImageCompatible(int32_t DeviceId, + __tgt_image_info *Info) const override { + assert(isValidDeviceId(DeviceId) && "Invalid CUDA device"); + + CUdevice Device; + if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS) + return false; + + int32_t Major, Minor; + if (cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + Device) != CUDA_SUCCESS) + return false; + + if (cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + Device) != CUDA_SUCCESS) + return false; + + StringRef ArchStr(Info->Arch); + StringRef PrefixStr("sm_"); + if (!ArchStr.startswith(PrefixStr)) { + REPORT("Unrecognized image architecture %s\n", ArchStr.data()); + return false; + } + + int32_t ImageMajor = ArchStr[PrefixStr.size()+0] - '0'; + int32_t ImageMinor = ArchStr[PrefixStr.size()+1] - '0'; + + // A cubin generated for a certain compute capability is supported to run on + // any GPU with the same major revision and same or higher minor revision. 
+    return (Major == ImageMajor && Minor >= ImageMinor);
+  }
+};
+
+StatusCode
+CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
+                               GenericDeviceTy &DstGenericDevice,
+                               void *DstPtr, int64_t Size,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  if (setContext())
+    return StatusCode::FAIL;
+
+  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
+
+  CUresult Err;
+  int32_t DstDeviceId = DstDevice.DeviceId;
+  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
+  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
+
+  int CanAccessPeer = 0;
+  if (DeviceId != DstDeviceId) {
+    // Make sure the lock is released before performing the copies.
+    std::lock_guard Lock(PeerAccessesLock);
+
+    switch (PeerAccesses[DstDeviceId]) {
+    case PeerAccessState::AVAILABLE:
+      CanAccessPeer = 1;
+      break;
+    case PeerAccessState::UNAVAILABLE:
+      CanAccessPeer = 0;
+      break;
+    case PeerAccessState::PENDING:
+      // Check whether the source device can access the destination device.
+      Err = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
+      if (checkResult(Err, "Error returned from cuDeviceCanAccessPeer\n"))
+        return StatusCode(Err);
+
+      if (CanAccessPeer) {
+        Err = cuCtxEnablePeerAccess(DstDevice.Context, 0);
+        if (Err == CUDA_ERROR_TOO_MANY_PEERS) {
+          // Resources may be exhausted due to many P2P links.
+          CanAccessPeer = 0;
+          DP("Too many P2P links, falling back to D2D memcpy\n");
+        } else if (checkResult(Err,
+                               "Error returned from cuCtxEnablePeerAccess\n"))
+          return StatusCode(Err);
+      }
+      PeerAccesses[DstDeviceId] = (CanAccessPeer)
+                                      ? PeerAccessState::AVAILABLE
+                                      : PeerAccessState::UNAVAILABLE;
+      break;
+    }
+  }
+
+  CUstream Stream = getStream(AsyncInfoWrapper);
+
+  // TODO: Should we fall back to D2D if peer access fails?
+  if (CanAccessPeer)
+    return StatusCode(cuMemcpyPeerAsync(CUDstPtr, DstDevice.Context, CUSrcPtr,
+                                        Context, Size, Stream));
+
+  // Fallback to D2D copy.
+  return StatusCode(cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream));
+}
+
+StatusCode Plugin::init() {
+  // Call the getter to initialize the CUDA plugin.
+  get();
+  return StatusCode::SUCCESS;
+}
+
+StatusCode Plugin::deinit() {
+  // The CUDA plugin and the CUDA driver should already be deinitialized
+  // at this point. So do nothing for this plugin.
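dataExchangeImpl() above is what can service an application-level device-to-device copy when both devices belong to this plugin and peer access is available, with cuMemcpyDtoDAsync as the fallback. A hedged host-side example that may exercise that path (the runtime can also choose to stage through the host depending on configuration):

#include <omp.h>
#include <cstdio>

int main() {
  if (omp_get_num_devices() < 2) {
    std::printf("need two devices for a device-to-device copy\n");
    return 0;
  }

  const size_t Size = 1024;
  void *SrcPtr = omp_target_alloc(Size, /*device_num=*/0);
  void *DstPtr = omp_target_alloc(Size, /*device_num=*/1);

  // Copy between the two device allocations; on the CUDA plugin this can
  // use cuMemcpyPeerAsync when peer access is enabled.
  int Rc = omp_target_memcpy(DstPtr, SrcPtr, Size, /*dst_offset=*/0,
                             /*src_offset=*/0, /*dst_device=*/1,
                             /*src_device=*/0);
  std::printf("omp_target_memcpy returned %d\n", Rc);

  omp_target_free(SrcPtr, 0);
  omp_target_free(DstPtr, 1);
  return 0;
}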
+ if (Plugin::isActive()) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; +} + +GenericPluginTy &Plugin::get() { + static CUDAPluginTy CUDAPlugin; + assert(Plugin::isActive() && "Plugin is not active"); + return CUDAPlugin; +} + +const char *Plugin::getErrorStr(const StatusCode &SC) { + CUresult ErrorCode = static_cast(SC.getCode()); + + const char *ErrorStr = nullptr; + CUresult Ret = cuGetErrorString(ErrorCode, &ErrorStr); + if (Ret == CUDA_SUCCESS) { + return ErrorStr; + } else if (Ret == CUDA_ERROR_INVALID_VALUE) { + REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code: %d\n", ErrorCode); + } else { + checkResult(Ret, "Error returned from cuGetErrorString\n"); + } + return nullptr; +} + +} // plugin +} // target +} // omp +} // llvm Index: openmp/libomptarget/plugins-nextgen/exports =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/exports @@ -0,0 +1,40 @@ +VERS1.0 { + global: + __tgt_rtl_init_plugin; + __tgt_rtl_deinit_plugin; + __tgt_rtl_is_valid_binary; + __tgt_rtl_is_valid_binary_info; + __tgt_rtl_is_data_exchangable; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_requires; + __tgt_rtl_init_device; + __tgt_rtl_deinit_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_exchange; + __tgt_rtl_data_exchange_async; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_team_region_async; + __tgt_rtl_run_target_region; + __tgt_rtl_run_target_region_async; + __tgt_rtl_synchronize; + __tgt_rtl_register_lib; + __tgt_rtl_unregister_lib; + __tgt_rtl_supports_empty_images; + __tgt_rtl_set_info_flag; + __tgt_rtl_print_device_info; + __tgt_rtl_create_event; + __tgt_rtl_record_event; + __tgt_rtl_wait_event; + __tgt_rtl_sync_event; + __tgt_rtl_destroy_event; + __tgt_rtl_init_device_info; + __tgt_rtl_init_async_info; + local: + *; +}; Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -0,0 +1,417 @@ +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL NextGen for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/DynamicLibrary.h" + +// The number of devices in this plugin. +#define NUM_DEVICES 4 + +// The ELF ID should be defined at compile-time by the build system. +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Forward declarations for all specialized data structures. 
+struct GenELF64KernelTy; +struct GenELF64DeviceTy; +struct GenELF64PluginTy; + +using llvm::sys::DynamicLibrary; + +/// Class implementing kernel functionalities for GenELF64. +struct GenELF64KernelTy : public GenericKernelTy { + /// Construct the kernel with a name, execution mode and a function. + GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, + void (*Func)(void)) + : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + + /// Initialize the kernel. + StatusCode initImpl(GenericDeviceTy &GenericDevice) override { + // Set the maximum number of threads to a single. + MaxNumThreads = 1; + return StatusCode::SUCCESS; + } + + /// Launch the kernel using the libffi. + StatusCode launchImpl(GenericDeviceTy &GenericDevice, int32_t NumThreads, + int32_t NumBlocks, int32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override { + // Create a vector of ffi_types, one per argument. + SmallVector ArgTypes(NumKernelArgs, &ffi_type_pointer); + ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr; + + // Prepare the cif structure before running the kernel function. + ffi_cif Cif; + ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, NumKernelArgs, + &ffi_type_void, ArgTypesPtr); + if (Status != FFI_OK) + return StatusCode::FAIL; + + // Call the kernel function through libffi. + long Return; + ffi_call(&Cif, Func, &Return, (void **) KernelArgs); + + return StatusCode::SUCCESS; + } + + /// Get the default number of blocks and threads for the kernel. + int32_t getDefaultNumBlocks(GenericDeviceTy &) const override { return 1; } + int32_t getDefaultNumThreads(GenericDeviceTy &) const override { return 1; } + +private: + /// The kernel function to execute. + void (*Func)(void); +}; + +/// Class implementing the GenELF64 device images properties. +struct GenELF64DeviceImageTy : public DeviceImageTy { + /// Create the GenELF64 image with the id and the target image pointer. + GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), DynLib() {} + + /// Getter and setter for the dynamic library. + DynamicLibrary &getDynamicLibrary() { return DynLib; } + void setDynamicLibrary(const DynamicLibrary &Lib) { DynLib = Lib; } + +private: + /// The dynamic library that loaded the image. + DynamicLibrary DynLib; +}; + +/// Class implementing the device functionalities for GenELF64. +struct GenELF64DeviceTy : public GenericDeviceTy { + /// Create the device with a specific id. + GenELF64DeviceTy(int32_t DeviceId, int32_t NumDevices) + : GenericDeviceTy(DeviceId, NumDevices, GenELF64GridValues) {} + + ~GenELF64DeviceTy() {} + + /// Initialize the device, which is a no-op + StatusCode initImpl(GenericPluginTy &Plugin) override { + return StatusCode::SUCCESS; + } + + /// Deinitialize the device, which is a no-op + StatusCode deinitImpl() override { + return StatusCode::SUCCESS; + } + + /// Construct the kernel for a specific image on the device. + GenericKernelTy * + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + GlobalTy Function(KernelEntry); + + // Get the metadata (address) of the kernel function. + GenericGlobalHandlerTy &GlobalHandler = Plugin::get().getGlobalHandler(); + if (GlobalHandler.getGlobalMetadataFromDevice(*this, Image, Function)) + return nullptr; + + // Allocate and create the kernel. 
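launchImpl() above drives the host "kernel" through libffi: every kernel argument is described as a generic pointer and the entry point returns void. The same prepare-and-call pattern in a stand-alone form, with a fake kernel in place of the offload entry, is sketched below.

#include <ffi.h>
#include <cstdio>

static void fakeKernel(int *A, int *B) { *A += *B; }

int main() {
  int X = 40, Y = 2;
  int *PX = &X, *PY = &Y;

  // One ffi_type per argument; everything is a pointer, as in launchImpl.
  ffi_type *ArgTypes[] = {&ffi_type_pointer, &ffi_type_pointer};
  void *Args[] = {&PX, &PY};

  ffi_cif Cif;
  if (ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, 2, &ffi_type_void, ArgTypes) !=
      FFI_OK) {
    std::printf("ffi_prep_cif failed\n");
    return 1;
  }

  // ffi_call takes a void(*)() entry point; the plugin stores its kernel
  // function pointers the same way.
  ffi_call(&Cif, reinterpret_cast<void (*)()>(fakeKernel), nullptr, Args);

  std::printf("X = %d\n", X); // 42
  return 0;
}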
+ GenELF64KernelTy *GenELF64Kernel = Plugin::get().allocate(); + new (GenELF64Kernel) + GenELF64KernelTy(KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)()) Function.getPtr()); + + return GenELF64Kernel; + } + + /// Set the current context to this device, which is a no-op. + StatusCode setContext() override { + return StatusCode::SUCCESS; + } + + /// Load the binary image into the device and allocate an image object. + DeviceImageTy *loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) override { + // Allocate and initialize the image object. + GenELF64DeviceImageTy *Image = + Plugin::get().allocate(); + new (Image) + GenELF64DeviceImageTy(ImageId, TgtImage); + + // Create a temporary file. + char TmpFileName[] = "/tmp/tmpfile_XXXXXX"; + int TmpFileFd = mkstemp(TmpFileName); + if (TmpFileFd == -1) { + DP("Failed to create tmp file for loading target image\n"); + return nullptr; + } + + // Open the temporary file. + FILE *TmpFile = fdopen(TmpFileFd, "wb"); + if (!TmpFile) { + DP("Failed to open tmp file %s for loading target image\n", TmpFileName); + return nullptr; + } + + // Write the image into the temporary file. + size_t Written = fwrite(Image->getStart(), Image->getSize(), 1, TmpFile); + if (Written != 1) { + DP("Failed to write target image to tmp file %s\n", TmpFileName); + return nullptr; + } + + // Close the temporary file. + int Err = fclose(TmpFile); + if (Err) { + DP("Failed to close tmp file %s with the target image\n", TmpFileName); + return nullptr; + } + + // Load the temporary file as a dynamic library. + std::string ErrMsg; + DynamicLibrary DynLib = + DynamicLibrary::getPermanentLibrary(TmpFileName, &ErrMsg); + + // Check if the loaded library is valid. + if (!DynLib.isValid()) { + DP("Failed to load target image: %s\n", ErrMsg.c_str()); + return nullptr; + } + + // Save a reference of the image's dynamic library. + Image->setDynamicLibrary(DynLib); + + return Image; + } + + /// Allocate memory. Use std::malloc in all cases. + void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + if (Size == 0) + return nullptr; + + void *MemAlloc = nullptr; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + MemAlloc = std::malloc(Size); + break; + } + return MemAlloc; + } + + /// Free the memory. Use std::free in all cases. + int free(void *TgtPtr, TargetAllocTy Kind) override { + std::free(TgtPtr); + return OFFLOAD_SUCCESS; + } + + /// Submit data to the device (host to device transfer). + StatusCode dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + std::memcpy(TgtPtr, HstPtr, Size); + return StatusCode::SUCCESS; + } + + /// Retrieve data from the device (device to host transfer). + StatusCode dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + std::memcpy(HstPtr, TgtPtr, Size); + return StatusCode::SUCCESS; + } + + /// Exchange data between two devices within the plugin. This function is not + /// supported in this plugin. + StatusCode dataExchangeImpl(const void *SrcPtr, + GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // This function should never be called because the function + // GenELF64Plugin::isDataExchangable() returns false. + return StatusCode::FAIL; + } + + /// All functions are already synchronous. 
No need to do anything on this + /// synchronization function. + StatusCode synchronizeImpl(__tgt_async_info &AsyncInfo) override { + return StatusCode::SUCCESS; + } + + /// This plugin does not support interoperability + StatusCode initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return StatusCode::FAIL; + } + + /// This plugin does not support interoperability + StatusCode initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { + return StatusCode::FAIL; + } + + /// This plugin does not support the event API. Do nothing without failing. + StatusCode createEventImpl(void **EventPtrStorage) override { + *EventPtrStorage = nullptr; + return StatusCode::SUCCESS; + } + StatusCode destroyEventImpl(void *EventPtr) override { + return StatusCode::SUCCESS; + } + StatusCode recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return StatusCode::SUCCESS; + } + StatusCode waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return StatusCode::SUCCESS; + } + StatusCode syncEventImpl(void *EventPtr) override { + return StatusCode::SUCCESS; + } + + /// Print information about the device. + void printInfoImpl() override { + printf(" This is a generic-elf-64bit device\n"); + } + + /// This plugin should not setup the device environment. + virtual bool shouldSetupDeviceEnvironment() const override { return false; }; + + /// Getters and setters for stack size and heap size not relevant. + StatusCode getDeviceStackSize(uint64_t &Value) override { + Value = 0; + return StatusCode::SUCCESS; + } + StatusCode setDeviceStackSize(uint64_t Value) override { + return StatusCode::SUCCESS; + } + StatusCode getDeviceHeapSize(uint64_t &Value) override { + Value = 0; + return StatusCode::SUCCESS; + } + StatusCode setDeviceHeapSize(uint64_t Value) override { + return StatusCode::SUCCESS; + } + +private: + /// Grid values for Generic ELF64 plugins. + static constexpr GV GenELF64GridValues = { + 1, // GV_Slot_Size + 1, // GV_Warp_Size + 1, // GV_Max_Teams + 1, // GV_SimpleBufferSize + 1, // GV_Max_WG_Size + 1, // GV_Default_WG_Size + }; +}; + +class GenELF64GlobalHandler final : public GenericGlobalHandlerTy { +public: + StatusCode getGlobalMetadataFromDevice(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override { + const char *GlobalName = DeviceGlobal.getName().data(); + GenELF64DeviceImageTy &GenELF64Image = + static_cast(Image); + + // Get dynamic library that has loaded the device image. + DynamicLibrary &DynLib = GenELF64Image.getDynamicLibrary(); + + // Get the address of the symbol. + void *Addr = DynLib.getAddressOfSymbol(GlobalName); + if (Addr == nullptr) { + DP("Failed to load global '%s'\n", GlobalName); + return StatusCode::FAIL; + } + + // Save the pointer to the symbol. + DeviceGlobal.setPtr(Addr); + + return StatusCode::SUCCESS; + } +}; + +/// Class implementing the plugin functionalities for GenELF64. +struct GenELF64PluginTy final : public GenericPluginTy { + /// Create the plugin. + GenELF64PluginTy() : GenericPluginTy() { + // Initialize the generic plugin structure with multiple devices and a + // global handler. + GenericPluginTy::init(NUM_DEVICES, new GenELF64GlobalHandler()); + } + + /// This class should not be copied. + GenELF64PluginTy(const GenELF64PluginTy &) = delete; + GenELF64PluginTy(GenELF64PluginTy &&) = delete; + + ~GenELF64PluginTy() {} + + /// Get the ELF code to recognize the compatible binary images. 
+ uint16_t getMagicElfBits() const override { return TARGET_ELF_ID; } + + /// Create a GenELF64 device with a specific id. + GenELF64DeviceTy &createDevice(int32_t DeviceId) override { + GenELF64DeviceTy *Device = new GenELF64DeviceTy(DeviceId, getNumDevices()); + return *Device; + } + + /// This plugin does not support exchanging data between two devices. + bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { + return false; + } + + /// All images (ELF-compatible) should be compatible with this plugin. + bool isImageCompatible(int32_t DeviceId, + __tgt_image_info *Info) const override { + assert(isValidDeviceId(DeviceId) && "Invalid Generic ELF64 device"); + return true; + } +}; + +StatusCode Plugin::init() { + // Call the getter to intialize the GenELF64 plugin. + get(); + return StatusCode::SUCCESS; +} + +StatusCode Plugin::deinit() { + // The Generic ELF64 plugin should already be deinitialized at this point. + if (Plugin::isActive()) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; +} + +GenericPluginTy &Plugin::get() { + static GenELF64PluginTy GenELF64Plugin; + assert(Plugin::isActive() && "Plugin is not active"); + return GenELF64Plugin; +} + +const char *Plugin::getErrorStr(const StatusCode &SC) { + return "No description"; +} + +} // plugin +} // target +} // omp +} // llvm Index: openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64le machine if available. 
+# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a x86_64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") +else() + libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -15,6 +15,7 @@ #include "device.h" #include "private.h" #include "rtl.h" +#include "Utilities.h" #include #include @@ -24,16 +25,17 @@ using namespace llvm; using namespace llvm::sys; +using namespace llvm::omp::target; // List of all plugins that can support offloading. static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so", - /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", - /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", - /* Remote target */ "libomptarget.rtl.rpc.so", + /* PowerPC target */ "libomptarget.rtl.ppc64", + /* x86_64 target */ "libomptarget.rtl.x86_64", + /* CUDA target */ "libomptarget.rtl.cuda", + /* AArch64 target */ "libomptarget.rtl.aarch64", + /* SX-Aurora VE target */ "libomptarget.rtl.ve", + /* AMDGPU target */ "libomptarget.rtl.amdgpu", + /* Remote target */ "libomptarget.rtl.rpc", }; PluginManager *PM; @@ -86,152 +88,166 @@ DP("Loading RTLs...\n"); + BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", false); + // Attempt to open all the plugins and, if they exist, check if the interface // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - std::string ErrMsg; - auto DynLibrary = std::make_unique( - sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); - - if (!DynLibrary->isValid()) { - // Library does not exist or cannot be found. - DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str()); - continue; - } - - DP("Successfully loaded library '%s'!\n", Name); - + for (const char *Name : RTLNames) { AllRTLs.emplace_back(); - // Retrieve the RTL information from the runtime library. 
- RTLInfoTy &R = AllRTLs.back(); - - // Remove plugin on failure to call optional init_plugin - *((void **)&R.init_plugin) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin"); - if (R.init_plugin) { - int32_t Rc = R.init_plugin(); - if (Rc != OFFLOAD_SUCCESS) { - DP("Unable to initialize library '%s': %u!\n", Name, Rc); - AllRTLs.pop_back(); + RTLInfoTy &RTL = AllRTLs.back(); + + const std::string BaseRTLName(Name); + if (NextGenPlugins) { + if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL)) continue; - } - } - bool ValidPlugin = true; - - if (!(*((void **)&R.is_valid_binary) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary"))) - ValidPlugin = false; - if (!(*((void **)&R.number_of_devices) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) - ValidPlugin = false; - if (!(*((void **)&R.init_device) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) - ValidPlugin = false; - if (!(*((void **)&R.load_binary) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary"))) - ValidPlugin = false; - if (!(*((void **)&R.data_alloc) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc"))) - ValidPlugin = false; - if (!(*((void **)&R.data_submit) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit"))) - ValidPlugin = false; - if (!(*((void **)&R.data_retrieve) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve"))) - ValidPlugin = false; - if (!(*((void **)&R.data_delete) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete"))) - ValidPlugin = false; - if (!(*((void **)&R.run_region) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region"))) - ValidPlugin = false; - if (!(*((void **)&R.run_team_region) = DynLibrary->getAddressOfSymbol( - "__tgt_rtl_run_target_team_region"))) - ValidPlugin = false; - - // Invalid plugin - if (!ValidPlugin) { - DP("Invalid plugin as necessary interface is not found.\n"); - AllRTLs.pop_back(); - continue; + DP("Falling back to original plugin...\n"); } - // No devices are supported by this RTL? - if (!(R.NumberOfDevices = R.number_of_devices())) { - // The RTL is invalid! Will pop the object from the RTLs list. - DP("No devices supported in this RTL\n"); + if (!attemptLoadRTL(BaseRTLName + ".so", RTL)) AllRTLs.pop_back(); - continue; + } + + DP("RTLs loaded!\n"); +} + +bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) { + const char *Name = RTLName.c_str(); + + DP("Loading library '%s'...\n", Name); + + std::string ErrMsg; + auto DynLibrary = std::make_unique( + sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); + + if (!DynLibrary->isValid()) { + // Library does not exist or cannot be found. 
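The gating at the top of this hunk relies on the BoolEnvar helper declared elsewhere in this patch: when LIBOMPTARGET_NEXTGEN_PLUGINS evaluates to true, loadRTLs() first tries libomptarget.rtl.<name>.nextgen.so and only then falls back to the original libomptarget.rtl.<name>.so. A rough stand-in for that helper is sketched below; the accepted spellings are an assumption, not the real parser.

#include <cstdio>
#include <cstdlib>
#include <cstring>

class BoolEnvarSketch {
  bool Value;

public:
  BoolEnvarSketch(const char *Name, bool Default) : Value(Default) {
    // Treat a handful of common spellings as "true"; anything else keeps the
    // default. The real helper may accept more forms.
    if (const char *Env = std::getenv(Name))
      Value = !std::strcmp(Env, "1") || !std::strcmp(Env, "true") ||
              !std::strcmp(Env, "TRUE") || !std::strcmp(Env, "on");
  }
  operator bool() const { return Value; }
};

int main() {
  // e.g. run the application with LIBOMPTARGET_NEXTGEN_PLUGINS=1 set.
  BoolEnvarSketch NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", false);
  std::printf("nextgen plugins %s\n", NextGenPlugins ? "enabled" : "disabled");
  return 0;
}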
+ DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str()); + return false; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Remove plugin on failure to call optional init_plugin + *((void **)&RTL.init_plugin) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin"); + if (RTL.init_plugin) { + int32_t Rc = RTL.init_plugin(); + if (Rc != OFFLOAD_SUCCESS) { + DP("Unable to initialize library '%s': %u!\n", Name, Rc); + return false; } + } -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif + bool ValidPlugin = true; + + if (!(*((void **)&RTL.is_valid_binary) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary"))) + ValidPlugin = false; + if (!(*((void **)&RTL.number_of_devices) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) + ValidPlugin = false; + if (!(*((void **)&RTL.init_device) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) + ValidPlugin = false; + if (!(*((void **)&RTL.load_binary) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_alloc) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_submit) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_retrieve) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_delete) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete"))) + ValidPlugin = false; + if (!(*((void **)&RTL.run_region) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region"))) + ValidPlugin = false; + if (!(*((void **)&RTL.run_team_region) = DynLibrary->getAddressOfSymbol( + "__tgt_rtl_run_target_team_region"))) + ValidPlugin = false; + + // Invalid plugin + if (!ValidPlugin) { + DP("Invalid plugin as necessary interface is not found.\n"); + return false; + } - DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), - R.NumberOfDevices); - - // Optional functions - *((void **)&R.deinit_plugin) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin"); - *((void **)&R.is_valid_binary_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info"); - *((void **)&R.deinit_device) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device"); - *((void **)&R.init_requires) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires"); - *((void **)&R.data_submit_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async"); - *((void **)&R.data_retrieve_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async"); - *((void **)&R.run_region_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async"); - *((void **)&R.run_team_region_async) = DynLibrary->getAddressOfSymbol( - "__tgt_rtl_run_target_team_region_async"); - *((void **)&R.synchronize) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize"); - *((void **)&R.data_exchange) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange"); - *((void **)&R.data_exchange_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async"); - *((void **)&R.is_data_exchangable) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable"); - *((void **)&R.register_lib) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib"); - *((void **)&R.unregister_lib) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib"); - *((void **)&R.supports_empty_images) = - 
DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images"); - *((void **)&R.set_info_flag) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag"); - *((void **)&R.print_device_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info"); - *((void **)&R.create_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event"); - *((void **)&R.record_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event"); - *((void **)&R.wait_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event"); - *((void **)&R.sync_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event"); - *((void **)&R.destroy_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event"); - *((void **)&R.release_async_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info"); - *((void **)&R.init_async_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); - *((void **)&R.init_device_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); - - R.LibraryHandler = std::move(DynLibrary); + // No devices are supported by this RTL? + if (!(RTL.NumberOfDevices = RTL.number_of_devices())) { + // The RTL is invalid! Will pop the object from the RTLs list. + DP("No devices supported in this RTL\n"); + return false; } - DP("RTLs loaded!\n"); +#ifdef LIBOMPTARGET_DEBUG + RTL.RTLName = Name; +#endif - return; + DP("Registering RTL %s supporting %d devices!\n", Name, RTL.NumberOfDevices); + + // Optional functions + *((void **)&RTL.deinit_plugin) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin"); + *((void **)&RTL.is_valid_binary_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info"); + *((void **)&RTL.deinit_device) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device"); + *((void **)&RTL.init_requires) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires"); + *((void **)&RTL.data_submit_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async"); + *((void **)&RTL.data_retrieve_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async"); + *((void **)&RTL.run_region_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async"); + *((void **)&RTL.run_team_region_async) = DynLibrary->getAddressOfSymbol( + "__tgt_rtl_run_target_team_region_async"); + *((void **)&RTL.synchronize) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize"); + *((void **)&RTL.data_exchange) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange"); + *((void **)&RTL.data_exchange_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async"); + *((void **)&RTL.is_data_exchangable) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable"); + *((void **)&RTL.register_lib) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib"); + *((void **)&RTL.unregister_lib) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib"); + *((void **)&RTL.supports_empty_images) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images"); + *((void **)&RTL.set_info_flag) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag"); + *((void **)&RTL.print_device_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info"); + *((void **)&RTL.create_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event"); + *((void **)&RTL.record_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event"); + *((void **)&RTL.wait_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event"); + *((void **)&RTL.sync_event) = + 
DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event"); + *((void **)&RTL.destroy_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event"); + *((void **)&RTL.release_async_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info"); + *((void **)&RTL.init_async_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); + *((void **)&RTL.init_device_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + + RTL.LibraryHandler = std::move(DynLibrary); + + // Successfully loaded + return true; } //////////////////////////////////////////////////////////////////////////////// Index: openmp/libomptarget/test/lit.cfg =================================================================== --- openmp/libomptarget/test/lit.cfg +++ openmp/libomptarget/test/lit.cfg @@ -21,6 +21,10 @@ if 'LIBOMPTARGET_DEBUG' in os.environ: config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG'] +# Allow running the tests with nextgen plugins when available +if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ: + config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS'] + if 'OMP_TARGET_OFFLOAD' in os.environ: config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD']