diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
--- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -281,3 +281,61 @@
 endif()
 
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
+
+################################################################################
+# Looking for OpenCL...
+################################################################################
+
+# CMake 3.4.3 cannot find the OpenCL library unless an OpenCL SDK is
+# installed as root. As a workaround, fall back to locating the library and
+# the includes manually for now.
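+# For example, with an OpenCL SDK unpacked under $HOME/oclsdk (a hypothetical
+# location), configuring with the environment variable
+#   LIBOMPTARGET_OCL_ROOT=$HOME/oclsdk
+# set makes the searches below consider $HOME/oclsdk/include and
+# $HOME/oclsdk/lib.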
+message(STATUS "Looking for OpenCL includes.")
+
+find_path(LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS
+  NAMES
+    CL/cl.h OpenCL/cl.h
+  PATHS
+    ENV LIBOMPTARGET_OCL_ROOT
+    ENV CPATH
+  PATH_SUFFIXES
+    include include/sycl)
+message(STATUS "OpenCL include DIR: ${LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS}")
+
+if (NOT LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS)
+  set(LIBOMPTARGET_DEP_OPENCL_FOUND FALSE)
+  message(STATUS "Could NOT find OpenCL. Missing includes.")
+else()
+
+  message(STATUS "Looking for OpenCL library.")
+
+  find_library(LIBOMPTARGET_DEP_OPENCL_LIBRARIES
+    NAMES OpenCL
+    PATHS
+      ENV LIBOMPTARGET_OCL_ROOT
+      ENV LIBRARY_PATH
+      ENV LD_LIBRARY_PATH
+    PATH_SUFFIXES
+      lib lib64 lib/intel64_lin)
+  message(STATUS "OpenCL lib: ${LIBOMPTARGET_DEP_OPENCL_LIBRARIES}")
+
+  if (NOT LIBOMPTARGET_DEP_OPENCL_LIBRARIES)
+    set(LIBOMPTARGET_DEP_OPENCL_FOUND FALSE)
+    message(STATUS "Could NOT find OpenCL. Missing libs.")
+  else()
+    set(LIBOMPTARGET_DEP_OPENCL_FOUND TRUE)
+  endif()
+
+endif()
+
+if (NOT LIBOMPTARGET_DEP_OPENCL_FOUND)
+  message(STATUS "Looking for OpenCL again.")
+  find_package(OpenCL)
+  set(LIBOMPTARGET_DEP_OPENCL_FOUND ${OpenCL_FOUND})
+  set(LIBOMPTARGET_DEP_OPENCL_LIBRARIES ${OpenCL_LIBRARIES})
+  set(LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+endif()
+
+mark_as_advanced(
+  LIBOMPTARGET_DEP_OPENCL_FOUND
+  LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS
+  LIBOMPTARGET_DEP_OPENCL_LIBRARIES)
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/CMakeLists.txt
@@ -81,6 +81,7 @@
 add_subdirectory(ve)
 add_subdirectory(x86_64)
 add_subdirectory(remote)
+add_subdirectory(opencl)
 
 # Make sure the parent scope can see the plugins that will be created.
 set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
--- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
@@ -10,7 +10,7 @@
 #
 ##===----------------------------------------------------------------------===##
 
-add_library(elf_common OBJECT elf_common.cpp)
+add_library(elf_common OBJECT elf_common.cpp elf_light.cpp)
 
 # Build elf_common with PIC to be able to link it with plugin shared libraries.
 set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_light.h b/openmp/libomptarget/plugins/common/elf_common/elf_light.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/common/elf_common/elf_light.h
@@ -0,0 +1,181 @@
+//===-- elf_light.h - Basic ELF functionality -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The light ELF implementation provides basic ELF reading functionality.
+// It may be used on systems without libelf support, provided that the
+// corresponding LLVM ELF implementation is available.
+// The interface declared here must be independent of libelf.h/elf.h.
+//
+// NOTE: we can try to rely on https://github.com/WolfgangSt/libelf
+//       on Windows, if this implementation gets more complex.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_LIGHT_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_LIGHT_H
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+
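+// Example usage (a minimal sketch; ImageStart/ImageSize stand for an ELF
+// image already loaded into memory):
+//   ElfL E(ImageStart, ImageSize);
+//   if (E.isValidElf())
+//     for (auto It = E.section_notes_begin(); It != E.section_notes_end();
+//          ++It) {
+//       ElfLNote Note = *It;
+//       // Inspect Note.getName(), Note.getDesc() and Note.getType().
+//     }
+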
+class ElfL;
+class ElfLSegmentNoteIterator;
+class ElfLSectionNoteIterator;
+class ElfLNote;
+class ElfLSection;
+class ElfLSectionIterator;
+
+// Class representing NOTEs from PT_NOTE segments and SHT_NOTE sections.
+class ElfLNote {
+  const void *Impl = nullptr;
+
+  friend class ElfLSegmentNoteIterator;
+  friend class ElfLSectionNoteIterator;
+
+  // Only the note iterators are allowed to create notes via their
+  // operator*().
+  explicit ElfLNote(const void *I);
+  ElfLNote &operator=(const ElfLNote &) = delete;
+
+public:
+  // FIXME: add move constructor and move assignment operator.
+  ElfLNote(const ElfLNote &);
+  ~ElfLNote();
+  // Returns the note's name size not including the null terminator.
+  // Note that it may be illegal to access the getName() pointer
+  // beyond the returned size, i.e. the implementation may
+  // not guarantee that there is a '\0' after getNameSize()
+  // characters of the name.
+  uint64_t getNameSize() const;
+  // Returns a pointer to the beginning of the note's name.
+  const char *getName() const;
+  // Returns the number of bytes in the descriptor.
+  uint64_t getDescSize() const;
+  // Returns a pointer to the beginning of the note's descriptor.
+  // It is illegal to access more than getDescSize() bytes
+  // via this pointer.
+  const uint8_t *getDesc() const;
+  uint64_t getType() const;
+};
+
+// Iterator over NOTEs in PT_NOTE segments.
+class ElfLSegmentNoteIterator
+    : std::iterator<std::forward_iterator_tag, ElfLNote> {
+
+  void *Impl = nullptr;
+
+  friend class ElfL;
+
+  // Only ElfL is allowed to create iterators to itself.
+  ElfLSegmentNoteIterator(const void *I, bool IsEnd = false);
+  ElfLSegmentNoteIterator &operator=(const ElfLSegmentNoteIterator &) = delete;
+
+public:
+  // FIXME: add move constructor and move assignment operator.
+  ElfLSegmentNoteIterator(const ElfLSegmentNoteIterator &Other);
+  ~ElfLSegmentNoteIterator();
+  ElfLSegmentNoteIterator &operator++();
+  bool operator==(const ElfLSegmentNoteIterator Other) const;
+  bool operator!=(const ElfLSegmentNoteIterator Other) const;
+  ElfLNote operator*() const;
+};
+
+// Iterator over NOTEs in SHT_NOTE sections.
+class ElfLSectionNoteIterator
+    : std::iterator<std::forward_iterator_tag, ElfLNote> {
+
+  void *Impl = nullptr;
+
+  friend class ElfL;
+
+  // Only ElfL is allowed to create iterators to itself.
+  ElfLSectionNoteIterator(const void *I, bool IsEnd = false);
+  ElfLSectionNoteIterator &operator=(const ElfLSectionNoteIterator &) = delete;
+
+public:
+  // FIXME: add move constructor and move assignment operator.
+  ElfLSectionNoteIterator(const ElfLSectionNoteIterator &Other);
+  ~ElfLSectionNoteIterator();
+  ElfLSectionNoteIterator &operator++();
+  bool operator==(const ElfLSectionNoteIterator Other) const;
+  bool operator!=(const ElfLSectionNoteIterator Other) const;
+  ElfLNote operator*() const;
+};
+
+// Class representing ELF section.
+class ElfLSection {
+  const void *Impl = nullptr;
+
+  friend class ElfLSectionIterator;
+
+  // Only ElfLSectionIterator is allowed to create sections via its
+  // operator*().
+  explicit ElfLSection(const void *I);
+  ElfLSection &operator=(const ElfLSection &) = delete;
+
+public:
+  // FIXME: add move constructor and move assignment operator.
+  ElfLSection(const ElfLSection &);
+  ~ElfLSection();
+
+  // Returns the section name, which is a null-terminated string.
+  const char *getName() const;
+  // Returns the section size.
+  uint64_t getSize() const;
+  // Returns a pointer to the beginning of the section.
+  const uint8_t *getContents() const;
+};
+
+// Iterator over sections.
+class ElfLSectionIterator
+    : std::iterator<std::forward_iterator_tag, ElfLSection> {
+
+  void *Impl = nullptr;
+
+  friend class ElfL;
+
+  // Only ElfL is allowed to create iterators to itself.
+  ElfLSectionIterator(const void *I, bool IsEnd = false);
+  ElfLSectionIterator &operator=(const ElfLSectionIterator &) = delete;
+
+public:
+  // FIXME: add move constructor and move assignment operator.
+  ElfLSectionIterator(const ElfLSectionIterator &Other);
+  ~ElfLSectionIterator();
+  ElfLSectionIterator &operator++();
+  bool operator==(const ElfLSectionIterator Other) const;
+  bool operator!=(const ElfLSectionIterator Other) const;
+  ElfLSection operator*() const;
+};
+
+// Wrapper around the given ELF image.
+class ElfL {
+  // Opaque pointer to the actual implementation.
+  void *Impl = nullptr;
+
+  // FIXME: implement if needed.
+  ElfL(const ElfL &) = delete;
+  ElfL &operator=(const ElfL &) = delete;
+
+public:
+  ElfL(char *Begin, size_t Size);
+  ~ElfL();
+  bool isValidElf() const;
+  const char *getErrmsg(int N) const;
+  uint16_t getEMachine() const;
+  uint16_t getEType() const;
+
+  static bool isDynType(uint16_t Ty);
+
+  ElfLSectionNoteIterator section_notes_begin() const;
+  ElfLSectionNoteIterator section_notes_end() const;
+  ElfLSegmentNoteIterator segment_notes_begin() const;
+  ElfLSegmentNoteIterator segment_notes_end() const;
+  ElfLSectionIterator sections_begin() const;
+  ElfLSectionIterator sections_end() const;
+};
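+
+// Iterating sections (a minimal sketch, given a valid ElfL object E):
+//   for (auto It = E.sections_begin(); It != E.sections_end(); ++It) {
+//     ElfLSection S = *It;
+//     // S.getName(), S.getSize() and S.getContents() describe the section.
+//   }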
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_LIGHT_H
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_light.cpp b/openmp/libomptarget/plugins/common/elf_common/elf_light.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/common/elf_common/elf_light.cpp
@@ -0,0 +1,1682 @@
+//===-- elf_light.cpp - Basic ELF functionality -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "elf_light.h"
+#include "Debug.h"
+#include <assert.h>
+#include <limits>
+
+#ifndef TARGET_NAME
+#define TARGET_NAME ELF light
+#endif
+#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME)
+
+#if MAY_USE_LIBELF
+
+// Implementation based on libelf.
+#include <gelf.h>
+#include <libelf.h>
+
+// Helper functions to align values and addresses. The alignment must be a
+// power of two.
+template <typename T> inline T alignDown(T value, size_t alignment) {
+  return (T)(value & ~(alignment - 1));
+}
+
+template <typename T> inline T *alignDown(T *value, size_t alignment) {
+  return reinterpret_cast<T *>(alignDown((intptr_t)value, alignment));
+}
+
+template <typename T> inline T alignUp(T value, size_t alignment) {
+  return alignDown((T)(value + alignment - 1), alignment);
+}
+
+template <typename T> inline T *alignUp(T *value, size_t alignment) {
+  // Align the raw address, not the typed pointer, so that the adjustment is
+  // done in bytes rather than in units of sizeof(T).
+  return reinterpret_cast<T *>(alignUp((intptr_t)value, alignment));
+}
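+
+// For example, alignUp(11, 4) == 12, alignDown(11, 4) == 8, and
+// alignUp(8, 4) == 8.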
+
+// FIXME: this is taken from openmp/libomptarget/plugins/amdgpu/impl/system.cpp,
+//        but it may be incorrect for 64-bit ELF. Elf64_Nhdr and Elf32_Nhdr
+//        have different representation. The alignment used for the name and
+//        the descriptor is still 4 bytes. At the same time, it seems to work
+//        for 64-bit ELFs produced by LLVM.
+struct Elf_Note {
+  uint32_t n_namesz; // Length of note's name.
+  uint32_t n_descsz; // Length of note's value.
+  uint32_t n_type;   // Type of note.
+  // then name
+  // then padding, optional
+  // then desc, at 4 byte alignment (not 8, despite being elf64)
+};
+
+static const uint32_t NoteAlignment = 4;
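+
+// For example, a note with n_namesz == 7 and n_descsz == 10 occupies
+// sizeof(Elf_Note) + alignUp(7, 4) + alignUp(10, 4) == 12 + 8 + 12 == 32
+// bytes in the image.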
+
+// Implementation of the iterator for SHT_NOTE sections.
+// The iterator allows processing all NOTEs in all SHT_NOTE sections
+// in the ELF image provided during the iterator object construction.
+class ElfLSectionNoteIteratorImpl {
+  // A pointer to Elf object created by elf_memory() for
+  // the ELF image we are going to iterate.
+  Elf *EF;
+
+  // A pointer to the current SHT_NOTE section.
+  // In the initial state of the iterator object this will actually
+  // point to the very first section in the ELF image, but it will be
+  // adjusted right away either to point to the first SHT_NOTE section
+  // or set to nullptr (if there are no SHT_NOTE sections).
+  Elf_Scn *CurrentSection = nullptr;
+
+  // A pointer to the current NOTE inside a SHT_NOTE section
+  // pointed by CurrentSection. If it is nullptr, then this means
+  // that the iterator object is an end() iterator.
+  Elf_Note *NPtr = nullptr;
+
+  uint64_t getNotesBeginAddr(const GElf_Shdr &Shdr) const {
+    return reinterpret_cast<uint64_t>(elf_rawfile(EF, nullptr)) +
+           Shdr.sh_offset;
+  }
+
+  uint64_t getNotesEndAddr(const GElf_Shdr &Shdr) const {
+    return getNotesBeginAddr(Shdr) + Shdr.sh_size;
+  }
+
+  uint64_t getNoteSize(const Elf_Note &Note) const {
+    return sizeof(Note) + alignUp(Note.n_namesz, NoteAlignment) +
+           alignUp(Note.n_descsz, NoteAlignment);
+  }
+
+  // Given the current state of the iterator object, advances
+  // the iterator forward to point to the next NOTE in the next
+  // SHT_NOTE section.
+  // If there is no such NOTE, then it sets the iterator
+  // object to the end() state.
+  //
+  // Note that this method does not change the iterator, if
+  // NPtr is pointing to a valid note within CurrentSection.
+  // The iterator advancement in this case is done via operator++.
+  void autoAdvance(bool IsFirst = false) {
+    // Cannot advance, if CurrentSection is NULL.
+    if (!CurrentSection)
+      return;
+
+    // NPtr points to a valid NOTE in CurrentSection, thus,
+    // no auto advancement.
+    if (NPtr)
+      return;
+
+    GElf_Shdr Shdr;
+
+    // CurrentSection is a valid section, and NPtr is an end() iterator.
+    //
+    // If IsFirst is true, then we are just in the initial state, and
+    // we need to set CurrentSection to the first SHT_NOTE section (if any),
+    // and, then, NPtr to the first note in this section.
+    //
+    // If IsFirst is false, then we've reached the end of the current
+    // SHT_NOTE section, and should find the next section with notes.
+    if (!IsFirst || gelf_getshdr(CurrentSection, &Shdr)->sh_type != SHT_NOTE)
+      CurrentSection = elf_nextscn(EF, CurrentSection);
+
+    while (CurrentSection &&
+           gelf_getshdr(CurrentSection, &Shdr)->sh_type != SHT_NOTE)
+      CurrentSection = elf_nextscn(EF, CurrentSection);
+
+    if (!CurrentSection) {
+      // No more sections.
+      // Note that NPtr is already nullptr indicating the end() iterator.
+      return;
+    }
+
+    gelf_getshdr(CurrentSection, &Shdr);
+    uint64_t NotesBegin = getNotesBeginAddr(Shdr);
+    uint64_t NotesEnd = getNotesEndAddr(Shdr);
+    if (NotesBegin >= NotesEnd) {
+      // Something went wrong. Assume that we've reached
+      // the end of all notes.
+      CurrentSection = nullptr;
+      NPtr = nullptr;
+      return;
+    }
+
+    NPtr = reinterpret_cast<Elf_Note *>(NotesBegin);
+    assert(NPtr && "Invalid SHT_NOTE section.");
+  }
+
+  bool operator!=(const ElfLSectionNoteIteratorImpl Other) const {
+    return !(*this == Other);
+  }
+
+public:
+  ElfLSectionNoteIteratorImpl(Elf *RawElf, bool IsEnd = false) : EF(RawElf) {
+    assert(EF && "Trying to iterate invalid ELF.");
+
+    if (IsEnd) {
+      // NPtr equal to nullptr means end() iterator.
+      return;
+    }
+
+    // Set CurrentSection to the very first section,
+    // and let autoAdvance() find the first valid note (if any).
+    CurrentSection = elf_getscn(EF, 0);
+    autoAdvance(true);
+  }
+
+  bool operator==(const ElfLSectionNoteIteratorImpl Other) const {
+    // They should be pointing to the same NOTE to be equal.
+    return NPtr == Other.NPtr;
+  }
+
+  const Elf_Note *operator*() const {
+    assert(*this != ElfLSectionNoteIteratorImpl(EF, true) &&
+           "Dereferencing the end iterator.");
+    return NPtr;
+  }
+
+  // Advance to the next NOTE in the CurrentSection.
+  // If there is no next NOTE, then autoAdvance() to the next
+  // SHT_NOTE section and its first NOTE.
+  ElfLSectionNoteIteratorImpl &operator++() {
+    assert(*this != ElfLSectionNoteIteratorImpl(EF, true) &&
+           "Incrementing the end iterator.");
+
+    GElf_Shdr Shdr;
+    gelf_getshdr(CurrentSection, &Shdr);
+    uint64_t NotesBegin = getNotesBeginAddr(Shdr);
+    uint64_t NotesEnd = getNotesEndAddr(Shdr);
+    assert(reinterpret_cast<uint64_t>(NPtr) >= NotesBegin &&
+           reinterpret_cast<uint64_t>(NPtr) < NotesEnd &&
+           "Invalid pointer to a note computed somewhere else.");
+    (void)NotesBegin;
+
+    uint64_t NoteSize = getNoteSize(*NPtr);
+    NPtr =
+        reinterpret_cast<Elf_Note *>(reinterpret_cast<char *>(NPtr) + NoteSize);
+    if (reinterpret_cast<uint64_t>(NPtr) >= NotesEnd ||
+        reinterpret_cast<uint64_t>(NPtr) + sizeof(*NPtr) >= NotesEnd) {
+      // We've reached the end of the current NOTE section.
+      NPtr = nullptr;
+    }
+
+    // Auto advance to the next section, if needed.
+    autoAdvance();
+    return *this;
+  }
+};
+
+// Implementation of the iterator for PT_NOTE segments.
+// The iterator allows processing all NOTEs in all PT_NOTE segments
+// in the ELF image provided during the iterator object construction.
+class ElfLSegmentNoteIteratorImpl {
+  // A pointer to Elf object created by elf_memory() for
+  // the ELF image we are going to iterate.
+  Elf *EF;
+
+  // The total number of program headers and the index of the current
+  // PT_NOTE segment. In the initial state of the iterator object
+  // CurrentSegment will actually refer to the very first segment in the
+  // ELF image, but it will be adjusted right away either to refer to the
+  // first PT_NOTE segment or set past NumberOfPhdrs (if there are no
+  // PT_NOTE segments).
+  size_t NumberOfPhdrs = (std::numeric_limits<size_t>::max)();
+  size_t CurrentSegment = (std::numeric_limits<size_t>::max)();
+
+  // A pointer to the current NOTE inside a PT_NOTE segment
+  // pointed by CurrentSegment. If it is nullptr, then this means
+  // that the iterator object is an end() iterator.
+  Elf_Note *NPtr = nullptr;
+
+  uint64_t getNotesBeginAddr(const GElf_Phdr &Phdr) const {
+    return reinterpret_cast<uint64_t>(elf_rawfile(EF, nullptr)) + Phdr.p_offset;
+  }
+
+  uint64_t getNotesEndAddr(const GElf_Phdr &Phdr) const {
+    return getNotesBeginAddr(Phdr) + Phdr.p_filesz;
+  }
+
+  uint64_t getNoteSize(const Elf_Note &Note) const {
+    return sizeof(Note) + alignUp(Note.n_namesz, NoteAlignment) +
+           alignUp(Note.n_descsz, NoteAlignment);
+  }
+
+  // Given the current state of the iterator object, advances
+  // the iterator forward to point to the next NOTE in the next
+  // PT_NOTE segment.
+  // If there is no such NOTE, then it sets the iterator
+  // object to the end() state.
+  //
+  // Note that this method does not change the iterator, if
+  // NPtr is pointing to a valid note within CurrentSegment.
+  // The iterator advancement in this case is done via operator++.
+  void autoAdvance(bool IsFirst = false) {
+    // Cannot advance, if CurrentSegment is invalid.
+    if (CurrentSegment >= NumberOfPhdrs)
+      return;
+
+    // NPtr points to a valid NOTE in CurrentSegment, thus,
+    // no auto advancement.
+    if (NPtr)
+      return;
+
+    GElf_Phdr Phdr;
+    gelf_getphdr(EF, CurrentSegment, &Phdr);
+
+    // CurrentSegment is a valid segment, and NPtr is an end() iterator.
+    //
+    // If IsFirst is true, then we are just in the initial state, and
+    // we need to set CurrentSegment to the first PT_NOTE segment (if any),
+    // and, then, NPtr to the first note in this segment.
+    //
+    // If IsFirst is false, then we've reached the end of the current
+    // PT_NOTE segment, and should find the next segment with notes.
+    if (!IsFirst || Phdr.p_type != PT_NOTE)
+      ++CurrentSegment;
+
+    while (CurrentSegment < NumberOfPhdrs) {
+      // Skip segments whose headers cannot be read; always increment
+      // CurrentSegment on failure so the loop is guaranteed to terminate.
+      if (gelf_getphdr(EF, CurrentSegment, &Phdr) == &Phdr &&
+          Phdr.p_type == PT_NOTE)
+        break;
+
+      ++CurrentSegment;
+    }
+
+    if (CurrentSegment >= NumberOfPhdrs) {
+      // No more segments.
+      // Note that NPtr is already nullptr indicating the end() iterator.
+      return;
+    }
+
+    if (gelf_getphdr(EF, CurrentSegment, &Phdr) != &Phdr)
+      assert(false && "Invalid program header selected above.");
+
+    uint64_t NotesBegin = getNotesBeginAddr(Phdr);
+    uint64_t NotesEnd = getNotesEndAddr(Phdr);
+    if (NotesBegin >= NotesEnd) {
+      // Something went wrong. Assume that we've reached
+      // the end of all notes.
+      CurrentSegment = NumberOfPhdrs;
+      NPtr = nullptr;
+      return;
+    }
+
+    NPtr = reinterpret_cast<Elf_Note *>(NotesBegin);
+    assert(NPtr && "Invalid PT_NOTE segment.");
+  }
+
+  bool operator!=(const ElfLSegmentNoteIteratorImpl Other) const {
+    return !(*this == Other);
+  }
+
+public:
+  ElfLSegmentNoteIteratorImpl(Elf *RawElf, bool IsEnd = false) : EF(RawElf) {
+    assert(EF && "Trying to iterate invalid ELF.");
+
+    if (IsEnd) {
+      // NPtr equal to nullptr means end() iterator.
+      return;
+    }
+
+    // Set CurrentSegment to the very first segment,
+    // and let autoAdvance() find the first valid note (if any).
+    CurrentSegment = 0;
+
+    // Set NumberOfPhdrs to 0, if we cannot query it.
+    if (elf_getphdrnum(EF, &NumberOfPhdrs) != 0)
+      NumberOfPhdrs = 0;
+    autoAdvance(true);
+  }
+
+  bool operator==(const ElfLSegmentNoteIteratorImpl Other) const {
+    // They should be pointing to the same NOTE to be equal.
+    return NPtr == Other.NPtr;
+  }
+
+  const Elf_Note *operator*() const {
+    assert(*this != ElfLSegmentNoteIteratorImpl(EF, true) &&
+           "Dereferencing the end iterator.");
+    return NPtr;
+  }
+
+  // Advance to the next NOTE in the CurrentSegment.
+  // If there is no next NOTE, then autoAdvance() to the next
+  // PT_NOTE segment and its first NOTE.
+  ElfLSegmentNoteIteratorImpl &operator++() {
+    assert(*this != ElfLSegmentNoteIteratorImpl(EF, true) &&
+           "Incrementing the end iterator.");
+
+    GElf_Phdr Phdr;
+    gelf_getphdr(EF, CurrentSegment, &Phdr);
+    uint64_t NotesBegin = getNotesBeginAddr(Phdr);
+    uint64_t NotesEnd = getNotesEndAddr(Phdr);
+    assert(reinterpret_cast<uint64_t>(NPtr) >= NotesBegin &&
+           reinterpret_cast<uint64_t>(NPtr) < NotesEnd &&
+           "Invalid pointer to a note computed somewhere else.");
+    (void)NotesBegin;
+
+    uint64_t NoteSize = getNoteSize(*NPtr);
+    NPtr =
+        reinterpret_cast<Elf_Note *>(reinterpret_cast<char *>(NPtr) + NoteSize);
+    if (reinterpret_cast<uint64_t>(NPtr) >= NotesEnd ||
+        reinterpret_cast<uint64_t>(NPtr) + sizeof(*NPtr) >= NotesEnd) {
+      // We've reached the end of the current NOTE segment.
+      NPtr = nullptr;
+    }
+
+    // Auto advance to the next segment, if needed.
+    autoAdvance();
+    return *this;
+  }
+};
+
+class ElfLSectionImpl {
+  Elf *EF = nullptr;
+  Elf_Scn *Section = nullptr;
+
+public:
+  ElfLSectionImpl(Elf *EF, Elf_Scn *Section) : EF(EF), Section(Section) {}
+
+  const char *getName() const {
+    size_t SHStrNdx;
+    if (elf_getshdrstrndx(EF, &SHStrNdx) != 0)
+      return "";
+
+    GElf_Shdr Shdr;
+    gelf_getshdr(Section, &Shdr);
+    char *Name = elf_strptr(EF, SHStrNdx, static_cast<size_t>(Shdr.sh_name));
+    return Name ? Name : "";
+  }
+
+  uint64_t getSize() const {
+    Elf_Data *Desc = elf_rawdata(Section, nullptr);
+    if (!Desc)
+      return 0;
+
+    return Desc->d_size;
+  }
+
+  const uint8_t *getContents() const {
+    Elf_Data *Desc = elf_rawdata(Section, nullptr);
+    if (!Desc)
+      return nullptr;
+
+    return reinterpret_cast<const uint8_t *>(Desc->d_buf);
+  }
+};
+
+class ElfLSectionIteratorImpl {
+  // A pointer to Elf object created by elf_memory() for
+  // the ELF image we are going to iterate.
+  Elf *EF = nullptr;
+
+  // A pointer to the current section.
+  Elf_Scn *CurrentSection = nullptr;
+
+  bool operator!=(const ElfLSectionIteratorImpl Other) const {
+    return !(*this == Other);
+  }
+
+public:
+  ElfLSectionIteratorImpl(Elf *RawElf, bool IsEnd = false) : EF(RawElf) {
+    assert(EF && "Trying to iterate invalid ELF.");
+    if (IsEnd)
+      return;
+
+    CurrentSection = elf_getscn(EF, 0);
+  }
+
+  bool operator==(const ElfLSectionIteratorImpl Other) const {
+    return CurrentSection == Other.CurrentSection;
+  }
+
+  ElfLSectionImpl *operator*() const {
+    assert(*this != ElfLSectionIteratorImpl(EF, true) &&
+           "Dereferencing the end iterator.");
+    return new ElfLSectionImpl(EF, CurrentSection);
+  }
+
+  ElfLSectionIteratorImpl &operator++() {
+    assert(*this != ElfLSectionIteratorImpl(EF, true) &&
+           "Dereferencing the end iterator.");
+
+    CurrentSection = elf_nextscn(EF, CurrentSection);
+    return *this;
+  }
+};
+
+// Actual implementation of ElfL via libelf.
+// It is constructed from an ELF image defined by its
+// starting pointer in memory and a length in bytes.
+class ElfLImpl {
+  // A pointer to Elf object created by elf_memory() for
+  // the ELF image.
+  Elf *EF = nullptr;
+
+  // Class of the ELF image.
+  unsigned ElfClass = ELFCLASSNONE;
+
+  // A pointer to the ELF image's header.
+  // Depending on the class it may be either 'Elf32_Ehdr *'
+  // or 'Elf64_Ehdr *'.
+  const void *Header = nullptr;
+
+  // Let the owning object access this.
+  friend class ElfL;
+
+public:
+  ElfLImpl(Elf *RawElf, unsigned ElfClass, const void *Header)
+      : EF(RawElf), ElfClass(ElfClass), Header(Header) {}
+
+  // Allocates and constructs a new iterator for NOTEs in
+  // SHT_NOTE sections of the ELF image.
+  ElfLSectionNoteIteratorImpl *
+  createSectionNoteIteratorImpl(bool IsEnd) const {
+    return new ElfLSectionNoteIteratorImpl(EF, IsEnd);
+  }
+
+  // Allocates and constructs a new iterator for NOTEs in
+  // PT_NOTE segments of the ELF image.
+  ElfLSegmentNoteIteratorImpl *
+  createSegmentNoteIteratorImpl(bool IsEnd) const {
+    return new ElfLSegmentNoteIteratorImpl(EF, IsEnd);
+  }
+
+  ElfLSectionIteratorImpl *
+  createSectionIteratorImpl(bool IsEnd) const {
+    return new ElfLSectionIteratorImpl(EF, IsEnd);
+  }
+};
+
+ElfL::ElfL(char *Begin, size_t Size) {
+  Elf *ElfHandle = elf_memory(Begin, Size);
+  if (!ElfHandle)
+    return;
+
+  const Elf32_Ehdr *Header32 = elf32_getehdr(ElfHandle);
+  const Elf64_Ehdr *Header64 = elf64_getehdr(ElfHandle);
+
+  if (!Header32 == !Header64) {
+    // Either both headers resolved (ambiguous class) or neither did
+    // (unrecognized ELF image).
+    elf_end(ElfHandle);
+    return;
+  }
+
+  const void *Header = nullptr;
+  unsigned ElfClass = ELFCLASSNONE;
+
+  if (Header32) {
+    ElfClass = ELFCLASS32;
+    Header = reinterpret_cast<const void *>(Header32);
+  } else {
+    ElfClass = ELFCLASS64;
+    Header = reinterpret_cast<const void *>(Header64);
+  }
+
+  Impl = reinterpret_cast<void *>(new ElfLImpl(ElfHandle, ElfClass, Header));
+}
+
+ElfL::~ElfL() {
+  if (Impl) {
+    ElfLImpl *EImpl = reinterpret_cast<ElfLImpl *>(Impl);
+    elf_end(EImpl->EF);
+    delete EImpl;
+  }
+}
+
+bool ElfL::isValidElf() const {
+  ElfLImpl *EImpl = reinterpret_cast<ElfLImpl *>(Impl);
+  return Impl && EImpl->Header && EImpl->ElfClass != ELFCLASSNONE;
+}
+
+const char *ElfL::getErrmsg(int N) const { return elf_errmsg(-1); }
+
+uint16_t ElfL::getEMachine() const {
+  assert(isValidElf() && "Invalid ELF.");
+  ElfLImpl *EImpl = reinterpret_cast<ElfLImpl *>(Impl);
+  if (EImpl->ElfClass == ELFCLASS32)
+    return reinterpret_cast<const Elf32_Ehdr *>(EImpl->Header)->e_machine;
+  else if (EImpl->ElfClass == ELFCLASS64)
+    return reinterpret_cast<const Elf64_Ehdr *>(EImpl->Header)->e_machine;
+  else
+    assert(false && "Unsupported ELF class.");
+
+  return EM_NONE;
+}
+
+uint16_t ElfL::getEType() const {
+  assert(isValidElf() && "Invalid ELF.");
+  ElfLImpl *EImpl = reinterpret_cast<ElfLImpl *>(Impl);
+  if (EImpl->ElfClass == ELFCLASS32)
+    return reinterpret_cast<const Elf32_Ehdr *>(EImpl->Header)->e_type;
+  else if (EImpl->ElfClass == ELFCLASS64)
+    return reinterpret_cast<const Elf64_Ehdr *>(EImpl->Header)->e_type;
+  else
+    assert(false && "Unsupported ELF class.");
+
+  return ET_NONE;
+}
+
+bool ElfL::isDynType(uint16_t Ty) { return Ty == ET_DYN; }
+
+ElfLSectionNoteIterator ElfL::section_notes_begin() const {
+  return ElfLSectionNoteIterator(reinterpret_cast<const ElfLImpl *>(Impl));
+}
+
+ElfLSectionNoteIterator ElfL::section_notes_end() const {
+  return ElfLSectionNoteIterator(reinterpret_cast<const ElfLImpl *>(Impl),
+                                 true);
+}
+
+ElfLSectionNoteIterator::ElfLSectionNoteIterator(const void *I, bool IsEnd) {
+  const ElfLImpl *EImpl = reinterpret_cast<const ElfLImpl *>(I);
+  Impl = EImpl->createSectionNoteIteratorImpl(IsEnd);
+}
+
+ElfLSectionNoteIterator::ElfLSectionNoteIterator(
+    const ElfLSectionNoteIterator &Other) {
+  ElfLSectionNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionNoteIteratorImpl *>(Other.Impl);
+  Impl = new ElfLSectionNoteIteratorImpl(*IImpl);
+}
+
+ElfLSectionNoteIterator::~ElfLSectionNoteIterator() {
+  assert(Impl && "Invalid ElfLSectionNoteIterator object.");
+  ElfLSectionNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionNoteIteratorImpl *>(Impl);
+  delete IImpl;
+}
+
+bool ElfLSectionNoteIterator::operator==(
+    const ElfLSectionNoteIterator Other) const {
+  const ElfLSectionNoteIteratorImpl *Lhs =
+      reinterpret_cast<const ElfLSectionNoteIteratorImpl *>(Impl);
+  const ElfLSectionNoteIteratorImpl *Rhs =
+      reinterpret_cast<const ElfLSectionNoteIteratorImpl *>(Other.Impl);
+  return (*Lhs == *Rhs);
+}
+
+bool ElfLSectionNoteIterator::operator!=(
+    const ElfLSectionNoteIterator Other) const {
+  return !(*this == Other);
+}
+
+ElfLSectionNoteIterator &ElfLSectionNoteIterator::operator++() {
+  ElfLSectionNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionNoteIteratorImpl *>(Impl);
+  ++(*IImpl);
+  return *this;
+}
+
+ElfLNote ElfLSectionNoteIterator::operator*() const {
+  ElfLSectionNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionNoteIteratorImpl *>(Impl);
+  return ElfLNote(**IImpl);
+}
+
+ElfLSegmentNoteIterator ElfL::segment_notes_begin() const {
+  return ElfLSegmentNoteIterator(reinterpret_cast<const ElfLImpl *>(Impl));
+}
+
+ElfLSegmentNoteIterator ElfL::segment_notes_end() const {
+  return ElfLSegmentNoteIterator(reinterpret_cast<const ElfLImpl *>(Impl),
+                                 true);
+}
+
+ElfLSegmentNoteIterator::ElfLSegmentNoteIterator(const void *I, bool IsEnd) {
+  const ElfLImpl *EImpl = reinterpret_cast<const ElfLImpl *>(I);
+  Impl = EImpl->createSegmentNoteIteratorImpl(IsEnd);
+}
+
+ElfLSegmentNoteIterator::ElfLSegmentNoteIterator(
+    const ElfLSegmentNoteIterator &Other) {
+  ElfLSegmentNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSegmentNoteIteratorImpl *>(Other.Impl);
+  Impl = new ElfLSegmentNoteIteratorImpl(*IImpl);
+}
+
+ElfLSegmentNoteIterator::~ElfLSegmentNoteIterator() {
+  assert(Impl && "Invalid ElfLSegmentNoteIterator object.");
+  ElfLSegmentNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSegmentNoteIteratorImpl *>(Impl);
+  delete IImpl;
+}
+
+bool ElfLSegmentNoteIterator::operator==(
+    const ElfLSegmentNoteIterator Other) const {
+  const ElfLSegmentNoteIteratorImpl *Lhs =
+      reinterpret_cast<const ElfLSegmentNoteIteratorImpl *>(Impl);
+  const ElfLSegmentNoteIteratorImpl *Rhs =
+      reinterpret_cast<const ElfLSegmentNoteIteratorImpl *>(Other.Impl);
+  return (*Lhs == *Rhs);
+}
+
+bool ElfLSegmentNoteIterator::operator!=(
+    const ElfLSegmentNoteIterator Other) const {
+  return !(*this == Other);
+}
+
+ElfLSegmentNoteIterator &ElfLSegmentNoteIterator::operator++() {
+  ElfLSegmentNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSegmentNoteIteratorImpl *>(Impl);
+  ++(*IImpl);
+  return *this;
+}
+
+ElfLNote ElfLSegmentNoteIterator::operator*() const {
+  ElfLSegmentNoteIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSegmentNoteIteratorImpl *>(Impl);
+  return ElfLNote(**IImpl);
+}
+
+ElfLNote::ElfLNote(const void *I) {
+  // ElfLNote::Impl is a pointer to Elf_Note in this implementation.
+  // A pointer to Elf_Note is returned by
+  // ElfLSectionNoteIteratorImpl::operator*().
+  Impl = I;
+}
+
+ElfLNote::ElfLNote(const ElfLNote &Other) { Impl = Other.Impl; }
+
+ElfLNote::~ElfLNote() {}
+
+uint64_t ElfLNote::getNameSize() const {
+  const Elf_Note *Note = reinterpret_cast<const Elf_Note *>(Impl);
+  if (Note->n_namesz == 0)
+    return 0;
+  // libelf returns name size that accounts for the null terminator.
+  // ELF light interface returns the size ignoring it.
+  return Note->n_namesz - 1;
+}
+
+const char *ElfLNote::getName() const {
+  const Elf_Note *Note = reinterpret_cast<const Elf_Note *>(Impl);
+  return reinterpret_cast<const char *>(Note) + sizeof(*Note);
+}
+
+uint64_t ElfLNote::getDescSize() const {
+  const Elf_Note *Note = reinterpret_cast<const Elf_Note *>(Impl);
+  return Note->n_descsz;
+}
+
+const uint8_t *ElfLNote::getDesc() const {
+  const Elf_Note *Note = reinterpret_cast<const Elf_Note *>(Impl);
+  // The descriptor starts after the name padded to NoteAlignment. Use the
+  // raw n_namesz (which includes the null terminator) rather than
+  // getNameSize(), which excludes it.
+  return reinterpret_cast<const uint8_t *>(Note) + sizeof(*Note) +
+         alignUp(Note->n_namesz, NoteAlignment);
+}
+
+uint64_t ElfLNote::getType() const {
+  const Elf_Note *Note = reinterpret_cast<const Elf_Note *>(Impl);
+  return Note->n_type;
+}
+
+ElfLSection::ElfLSection(const void *I) {
+  Impl = I;
+}
+
+ElfLSection::ElfLSection(const ElfLSection &Other) {
+  const ElfLSectionImpl *SImpl =
+      reinterpret_cast<const ElfLSectionImpl *>(Other.Impl);
+  Impl = new ElfLSectionImpl(*SImpl);
+}
+
+ElfLSection::~ElfLSection() {
+  const ElfLSectionImpl *SImpl =
+      reinterpret_cast<const ElfLSectionImpl *>(Impl);
+  delete SImpl;
+}
+
+const char *ElfLSection::getName() const {
+  const ElfLSectionImpl *SImpl =
+      reinterpret_cast<const ElfLSectionImpl *>(Impl);
+  return SImpl->getName();
+}
+
+uint64_t ElfLSection::getSize() const {
+  const ElfLSectionImpl *SImpl =
+      reinterpret_cast<const ElfLSectionImpl *>(Impl);
+  return SImpl->getSize();
+}
+
+const uint8_t *ElfLSection::getContents() const {
+  const ElfLSectionImpl *SImpl =
+      reinterpret_cast<const ElfLSectionImpl *>(Impl);
+  return SImpl->getContents();
+}
+
+ElfLSectionIterator ElfL::sections_begin() const {
+  return ElfLSectionIterator(Impl);
+}
+
+ElfLSectionIterator ElfL::sections_end() const {
+  return ElfLSectionIterator(Impl, true);
+}
+
+ElfLSectionIterator::ElfLSectionIterator(const void *I, bool IsEnd) {
+  const ElfLImpl *EImpl = reinterpret_cast<const ElfLImpl *>(I);
+  Impl = EImpl->createSectionIteratorImpl(IsEnd);
+}
+
+ElfLSectionIterator::ElfLSectionIterator(
+    const ElfLSectionIterator &Other) {
+  ElfLSectionIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImpl *>(Other.Impl);
+  Impl = new ElfLSectionIteratorImpl(*IImpl);
+}
+
+ElfLSectionIterator::~ElfLSectionIterator() {
+  assert(Impl && "Invalid ElfLSectionIterator object.");
+  ElfLSectionIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImpl *>(Impl);
+  delete IImpl;
+}
+
+bool ElfLSectionIterator::operator==(
+    const ElfLSectionIterator Other) const {
+  const ElfLSectionIteratorImpl *Lhs =
+      reinterpret_cast<const ElfLSectionIteratorImpl *>(Impl);
+  const ElfLSectionIteratorImpl *Rhs =
+      reinterpret_cast<const ElfLSectionIteratorImpl *>(Other.Impl);
+  return (*Lhs == *Rhs);
+}
+
+bool ElfLSectionIterator::operator!=(
+    const ElfLSectionIterator Other) const {
+  return !(*this == Other);
+}
+
+ElfLSectionIterator &ElfLSectionIterator::operator++() {
+  ElfLSectionIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImpl *>(Impl);
+  ++(*IImpl);
+  return *this;
+}
+
+ElfLSection ElfLSectionIterator::operator*() const {
+  ElfLSectionIteratorImpl *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImpl *>(Impl);
+  return ElfLSection(**IImpl);
+}
+#else // !MAY_USE_LIBELF
+
+// Implementation based on LLVM ELF binary format.
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace llvm::ELF;
+using namespace llvm::object;
+
+class ElfLNoteImplBase {
+public:
+  virtual ~ElfLNoteImplBase() = default;
+  virtual ElfLNoteImplBase *clone() const = 0;
+  virtual size_t getNameSize() const = 0;
+  virtual const char *getName() const = 0;
+  virtual size_t getDescSize() const = 0;
+  virtual const uint8_t *getDesc() const = 0;
+  virtual uint32_t getType() const = 0;
+};
+
+template <class ELFT> class ElfLNoteImpl : public ElfLNoteImplBase {
+  using Elf_Note = typename ELFT::Note;
+  const Elf_Note Note;
+
+public:
+  ElfLNoteImpl(const Elf_Note Note) : Note(Note) {}
+  ElfLNoteImpl(const ElfLNoteImpl &) = default;
+  ElfLNoteImplBase *clone() const override { return new ElfLNoteImpl(*this); }
+  ~ElfLNoteImpl() = default;
+  size_t getNameSize() const override { return Note.getName().size(); }
+  const char *getName() const override { return Note.getName().data(); }
+  size_t getDescSize() const override { return Note.getDesc().size(); }
+  const uint8_t *getDesc() const override { return Note.getDesc().data(); }
+  uint32_t getType() const override { return Note.getType(); }
+};
+
+class ElfLNoteIteratorImplBase {
+protected:
+  const endianness TargetEndianness;
+  const bool Is64Bits;
+  const bool IsSectionIterator;
+
+  ElfLNoteIteratorImplBase(endianness TargetEndianness, bool Is64Bits,
+                           bool IsSectionIterator)
+      : TargetEndianness(TargetEndianness), Is64Bits(Is64Bits),
+        IsSectionIterator(IsSectionIterator) {}
+
+public:
+  ElfLNoteIteratorImplBase(const ElfLNoteIteratorImplBase &) = default;
+  virtual ~ElfLNoteIteratorImplBase() = default;
+  virtual ElfLNoteIteratorImplBase *clone() const = 0;
+  virtual ElfLNoteIteratorImplBase &operator++() = 0;
+  virtual bool operator==(const ElfLNoteIteratorImplBase &) const = 0;
+  virtual ElfLNoteImplBase *operator*() const = 0;
+
+  endianness getEndianness() const { return TargetEndianness; }
+
+  bool is64Bits() const { return Is64Bits; }
+
+  bool isSectionIterator() const { return IsSectionIterator; }
+};
+
+template <class ELFT>
+class ElfLNoteIteratorImpl : public ElfLNoteIteratorImplBase {
+protected:
+  using NoteIterator = typename ELFT::NoteIterator;
+
+  const ELFFile<ELFT> &EF;
+  NoteIterator NotesIt;
+  Error &Err;
+
+  explicit ElfLNoteIteratorImpl(const ELFFile<ELFT> &EF, Error &Err,
+                                bool IsSectionIterator)
+      : ElfLNoteIteratorImplBase(ELFT::TargetEndianness, ELFT::Is64Bits,
+                                 IsSectionIterator),
+        EF(EF), NotesIt(EF.notes_end()), Err(Err) {}
+
+public:
+  ElfLNoteIteratorImpl(const ElfLNoteIteratorImpl &) = default;
+  virtual ~ElfLNoteIteratorImpl() = default;
+
+  static bool classof(const ElfLNoteIteratorImplBase *B) {
+    return (B->getEndianness() == ELFT::TargetEndianness &&
+            B->is64Bits() == ELFT::Is64Bits);
+  }
+};
+
+template <class ELFT>
+class ElfLSectionNoteIteratorImpl : public ElfLNoteIteratorImpl<ELFT> {
+  using Elf_Shdr = typename ELFT::Shdr;
+  using Elf_Shdr_Range = typename ELFT::ShdrRange;
+  using NoteIterator = typename ElfLNoteIteratorImpl<ELFT>::NoteIterator;
+  using SectionsIteratorTy = typename Elf_Shdr_Range::iterator;
+
+  SectionsIteratorTy SectionsIt;
+
+  const ELFFile<ELFT> &getEF() const { return this->EF; }
+  const NoteIterator &getNotesIt() const { return this->NotesIt; }
+  Error &getErr() const { return this->Err; }
+  NoteIterator &getNotesIt() { return this->NotesIt; }
+  SectionsIteratorTy section_begin() const {
+    Expected<Elf_Shdr_Range> Sections = getEF().sections();
+    if (!Sections)
+      return SectionsIteratorTy();
+
+    return Sections->begin();
+  }
+
+  SectionsIteratorTy section_end() const {
+    Expected<Elf_Shdr_Range> Sections = getEF().sections();
+    if (!Sections)
+      return SectionsIteratorTy();
+
+    return Sections->end();
+  }
+
+  bool isEqual(const ElfLSectionNoteIteratorImpl &Lhs,
+               const ElfLSectionNoteIteratorImpl &Rhs) const {
+    // Check for end() iterators, first.
+    if (Lhs.SectionsIt == section_end() && Rhs.SectionsIt == section_end())
+      return true;
+
+    if (Lhs.SectionsIt != Rhs.SectionsIt)
+      return false;
+
+    return Lhs.getNotesIt() == Rhs.getNotesIt();
+  }
+
+  void autoAdvance(bool IsFirst = false) {
+    if (SectionsIt == section_end())
+      return;
+
+    if (getNotesIt() != getEF().notes_end())
+      return;
+
+    // SectionsIt is not an end iterator, and NotesIt is an end()
+    // iterator.
+    //
+    // If IsFirst is true, then we are just in the initial state, and
+    // we need to set SectionsIt to the first SHT_NOTE section (if any),
+    // and, then, NotesIt to the first note in this section.
+    //
+    // If IsFirst is false, then we've reached the end of the current
+    // SHT_NOTE section, and should find the next section with notes.
+    if (!IsFirst || SectionsIt->sh_type != ELF::SHT_NOTE)
+      ++SectionsIt;
+
+    while (SectionsIt != section_end() &&
+           SectionsIt->sh_type != ELF::SHT_NOTE) {
+      ++SectionsIt;
+    }
+
+    if (SectionsIt == section_end()) {
+      // No more sections.
+      return;
+    }
+
+    const Elf_Shdr &Section = *SectionsIt;
+    getNotesIt() = getEF().notes_begin(Section, getErr());
+
+    // Auto advance the iterator, if the NOTE section
+    // does not contain any notes (e.g. some error happened
+    // during the note parsing).
+    autoAdvance();
+  }
+
+  bool operator!=(const ElfLSectionNoteIteratorImpl &Other) const {
+    return !(*this == Other);
+  }
+
+public:
+  ElfLSectionNoteIteratorImpl(const ELFFile<ELFT> &EF, Error &Err,
+                              bool IsEnd = false)
+      : ElfLNoteIteratorImpl<ELFT>(EF, Err, true) {
+    if (IsEnd) {
+      SectionsIt = section_end();
+      // It is an end() iterator, if SectionsIt is an end() iterator.
+      return;
+    }
+
+    SectionsIt = section_begin();
+    autoAdvance(true);
+  }
+
+  ElfLSectionNoteIteratorImpl(const ElfLSectionNoteIteratorImpl &Copy) =
+      default;
+
+  ElfLNoteIteratorImplBase *clone() const override {
+    return new ElfLSectionNoteIteratorImpl(*this);
+  }
+
+  bool operator==(const ElfLNoteIteratorImplBase &Other) const override {
+    if (const ElfLSectionNoteIteratorImpl *OPtr =
+            dyn_cast<const ElfLSectionNoteIteratorImpl>(&Other)) {
+      return isEqual(*this, *OPtr);
+    }
+    return false;
+  }
+
+  ElfLSectionNoteIteratorImpl &operator++() override {
+    assert(*this != ElfLSectionNoteIteratorImpl(getEF(), getErr(), true) &&
+           "Incrementing the end iterator.");
+    // Move the notes iterator within the current section.
+    ++getNotesIt();
+    autoAdvance();
+
+    return *this;
+  }
+
+  ElfLNoteImplBase *operator*() const override {
+    assert(*this != ElfLSectionNoteIteratorImpl(getEF(), getErr(), true) &&
+           "Dereferencing the end iterator.");
+    return new ElfLNoteImpl<ELFT>(*getNotesIt());
+  }
+
+  static bool classof(const ElfLNoteIteratorImplBase *B) {
+    return (ElfLNoteIteratorImpl<ELFT>::classof(B) &&
+            B->isSectionIterator() == true);
+  }
+};
+
+template <class ELFT>
+class ElfLSegmentNoteIteratorImpl : public ElfLNoteIteratorImpl<ELFT> {
+  using Elf_Phdr = typename ELFT::Phdr;
+  using Elf_Phdr_Range = typename ELFT::PhdrRange;
+  using NoteIterator = typename ElfLNoteIteratorImpl<ELFT>::NoteIterator;
+  using SegmentIteratorTy = typename Elf_Phdr_Range::iterator;
+
+  SegmentIteratorTy SegmentsIt;
+
+  const ELFFile<ELFT> &getEF() const { return this->EF; }
+  const NoteIterator &getNotesIt() const { return this->NotesIt; }
+  Error &getErr() const { return this->Err; }
+  NoteIterator &getNotesIt() { return this->NotesIt; }
+  SegmentIteratorTy segment_begin() const {
+    Expected<Elf_Phdr_Range> Segments = getEF().program_headers();
+    if (!Segments)
+      return SegmentIteratorTy();
+
+    return Segments->begin();
+  }
+
+  SegmentIteratorTy segment_end() const {
+    Expected<Elf_Phdr_Range> Segments = getEF().program_headers();
+    if (!Segments)
+      return SegmentIteratorTy();
+
+    return Segments->end();
+  }
+
+  bool isEqual(const ElfLSegmentNoteIteratorImpl &Lhs,
+               const ElfLSegmentNoteIteratorImpl &Rhs) const {
+    // Check for end() iterators, first.
+    if (Lhs.SegmentsIt == segment_end() && Rhs.SegmentsIt == segment_end())
+      return true;
+
+    if (Lhs.SegmentsIt != Rhs.SegmentsIt)
+      return false;
+
+    return Lhs.getNotesIt() == Rhs.getNotesIt();
+  }
+
+  void autoAdvance(bool IsFirst = false) {
+    if (SegmentsIt == segment_end())
+      return;
+
+    if (getNotesIt() != getEF().notes_end())
+      return;
+
+    // SegmentsIt is not an end iterator, and NotesIt is an end()
+    // iterator.
+    //
+    // If IsFirst is true, then we are just in the initial state, and
+    // we need to set SegmentsIt to the first PT_NOTE segment (if any),
+    // and, then, NotesIt to the first note in this segment.
+    //
+    // If IsFirst is false, then we've reached the end of the current
+    // PT_NOTE segment, and should find the next segment with notes.
+    if (!IsFirst || SegmentsIt->p_type != ELF::PT_NOTE)
+      ++SegmentsIt;
+
+    while (SegmentsIt != segment_end() && SegmentsIt->p_type != ELF::PT_NOTE) {
+      ++SegmentsIt;
+    }
+
+    if (SegmentsIt == segment_end()) {
+      // No more segments.
+      return;
+    }
+
+    const Elf_Phdr &Segment = *SegmentsIt;
+    getNotesIt() = getEF().notes_begin(Segment, getErr());
+
+    // Auto advance the iterator, if the NOTE segment
+    // does not contain any notes (e.g. some error happened
+    // during the note parsing).
+    autoAdvance();
+  }
+
+  bool operator!=(const ElfLSegmentNoteIteratorImpl &Other) const {
+    return !(*this == Other);
+  }
+
+public:
+  ElfLSegmentNoteIteratorImpl(const ELFFile<ELFT> &EF, Error &Err,
+                              bool IsEnd = false)
+      : ElfLNoteIteratorImpl<ELFT>(EF, Err, false) {
+    if (IsEnd) {
+      SegmentsIt = segment_end();
+      // It is an end() iterator, if SegmentsIt is an end() iterator.
+      return;
+    }
+
+    SegmentsIt = segment_begin();
+    autoAdvance(true);
+  }
+
+  ElfLSegmentNoteIteratorImpl(const ElfLSegmentNoteIteratorImpl &Copy) =
+      default;
+
+  ElfLNoteIteratorImplBase *clone() const override {
+    return new ElfLSegmentNoteIteratorImpl(*this);
+  }
+
+  bool operator==(const ElfLNoteIteratorImplBase &Other) const override {
+    if (const ElfLSegmentNoteIteratorImpl *OPtr =
+            dyn_cast<const ElfLSegmentNoteIteratorImpl>(&Other)) {
+      return isEqual(*this, *OPtr);
+    }
+    return false;
+  }
+
+  ElfLSegmentNoteIteratorImpl &operator++() override {
+    assert(*this != ElfLSegmentNoteIteratorImpl(getEF(), getErr(), true) &&
+           "Incrementing the end iterator.");
+    // Move the notes iterator within the current segment.
+    ++getNotesIt();
+    autoAdvance();
+
+    return *this;
+  }
+
+  ElfLNoteImplBase *operator*() const override {
+    assert(*this != ElfLSegmentNoteIteratorImpl(getEF(), getErr(), true) &&
+           "Dereferencing the end iterator.");
+    return new ElfLNoteImpl<ELFT>(*getNotesIt());
+  }
+
+  static bool classof(const ElfLNoteIteratorImplBase *B) {
+    return (ElfLNoteIteratorImpl<ELFT>::classof(B) &&
+            B->isSectionIterator() == false);
+  }
+};
+
+class ElfLSectionImplBase {
+public:
+  virtual ~ElfLSectionImplBase() = default;
+  virtual ElfLSectionImplBase *clone() const = 0;
+  virtual const char *getName() const = 0;
+  virtual uint64_t getSize() const = 0;
+  virtual const uint8_t *getContents() const = 0;
+};
+
+template <class ELFT> class ElfLSectionImpl : public ElfLSectionImplBase {
+  using Elf_Shdr = typename ELFT::Shdr;
+
+  const ELFFile<ELFT> &EF;
+  const Elf_Shdr &Section;
+
+public:
+  ElfLSectionImpl(const ELFFile<ELFT> &EF, const Elf_Shdr &Section)
+    : EF(EF), Section(Section) {}
+  ElfLSectionImpl(const ElfLSectionImpl &) = default;
+  ElfLSectionImpl *clone() const override { return new ElfLSectionImpl(*this); }
+  ~ElfLSectionImpl() = default;
+
+  const char *getName() const override {
+    Expected<StringRef> NameOrErr = EF.getSectionName(Section);
+    if (!NameOrErr) {
+      consumeError(NameOrErr.takeError());
+      return "";
+    }
+    return NameOrErr->data();
+  }
+
+  uint64_t getSize() const override {
+    Expected<ArrayRef<uint8_t>> ContentsOrErr = EF.getSectionContents(Section);
+    if (!ContentsOrErr) {
+      consumeError(ContentsOrErr.takeError());
+      return 0;
+    }
+    return ContentsOrErr->size();
+  }
+
+  const uint8_t *getContents() const override {
+    Expected<ArrayRef<uint8_t>> ContentsOrErr = EF.getSectionContents(Section);
+    if (!ContentsOrErr) {
+      consumeError(ContentsOrErr.takeError());
+      return nullptr;
+    }
+    return ContentsOrErr->data();
+  }
+};
+
+class ElfLSectionIteratorImplBase {
+protected:
+  const endianness TargetEndianness;
+  const bool Is64Bits;
+
+  ElfLSectionIteratorImplBase(endianness TargetEndianness, bool Is64Bits)
+    : TargetEndianness(TargetEndianness), Is64Bits(Is64Bits) {}
+
+public:
+  ElfLSectionIteratorImplBase(const ElfLSectionIteratorImplBase &) = default;
+  virtual ~ElfLSectionIteratorImplBase() = default;
+  virtual ElfLSectionIteratorImplBase *clone() const = 0;
+  virtual ElfLSectionIteratorImplBase &operator++() = 0;
+  virtual bool operator==(const ElfLSectionIteratorImplBase &) const = 0;
+  virtual ElfLSectionImplBase *operator*() const = 0;
+
+  endianness getEndianness() const { return TargetEndianness; }
+
+  bool is64Bits() const { return Is64Bits; }
+};
+
+template <class ELFT>
+class ElfLSectionIteratorImpl : public ElfLSectionIteratorImplBase {
+  using Elf_Shdr = typename ELFT::Shdr;
+  using Elf_Shdr_Range = typename ELFT::ShdrRange;
+  using SectionsIteratorTy = typename Elf_Shdr_Range::iterator;
+
+  const ELFFile<ELFT> &EF;
+  SectionsIteratorTy SectionsIt;
+
+  const ELFFile<ELFT> &getEF() const { return EF; }
+
+  SectionsIteratorTy section_begin() const {
+    Expected<Elf_Shdr_Range> Sections = getEF().sections();
+    if (!Sections)
+      return SectionsIteratorTy();
+
+    return Sections->begin();
+  }
+
+  SectionsIteratorTy section_end() const {
+    Expected<Elf_Shdr_Range> Sections = getEF().sections();
+    if (!Sections)
+      return SectionsIteratorTy();
+
+    return Sections->end();
+  }
+
+  bool isEqual(const ElfLSectionIteratorImpl &Lhs,
+               const ElfLSectionIteratorImpl &Rhs) const {
+    return Lhs.SectionsIt == Rhs.SectionsIt;
+  }
+
+  bool operator!=(const ElfLSectionIteratorImpl Other) const {
+    return !(*this == Other);
+  }
+
+public:
+  ElfLSectionIteratorImpl(const ELFFile<ELFT> &EF, bool IsEnd = false)
+    : ElfLSectionIteratorImplBase(ELFT::TargetEndianness, ELFT::Is64Bits),
+      EF(EF) {
+    if (IsEnd) {
+      SectionsIt = section_end();
+      return;
+    }
+
+    SectionsIt = section_begin();
+  }
+
+  ElfLSectionIteratorImpl *clone() const override {
+    return new ElfLSectionIteratorImpl(*this);
+  }
+
+  bool operator==(const ElfLSectionIteratorImplBase &Other) const override {
+    if (const ElfLSectionIteratorImpl *OPtr =
+            dyn_cast<const ElfLSectionIteratorImpl>(&Other)) {
+      return isEqual(*this, *OPtr);
+    }
+    return false;
+  }
+
+  ElfLSectionImplBase *operator*() const override {
+    assert(*this != ElfLSectionIteratorImpl(EF, true) &&
+           "Dereferencing the end iterator.");
+    return new ElfLSectionImpl<ELFT>(EF, *SectionsIt);
+  }
+
+  ElfLSectionIteratorImpl &operator++() override {
+    assert(*this != ElfLSectionIteratorImpl(EF, true) &&
+           "Dereferencing the end iterator.");
+
+    ++SectionsIt;
+    return *this;
+  }
+
+  static bool classof(const ElfLSectionIteratorImplBase *B) {
+    return (B->getEndianness() == ELFT::TargetEndianness &&
+            B->is64Bits() == ELFT::Is64Bits);
+  }
+};
+
+class ElfLImplBase {
+public:
+  ElfLImplBase() = default;
+  ElfLImplBase(const ElfLImplBase &) = delete;
+  ElfLImplBase &operator=(const ElfLImplBase &) = delete;
+  virtual ~ElfLImplBase() = default;
+  virtual uint16_t getEMachine() const = 0;
+  virtual uint16_t getEType() const = 0;
+
+  virtual ElfLNoteIteratorImplBase *
+  createSectionNoteIteratorImpl(bool IsEnd) const = 0;
+  virtual ElfLNoteIteratorImplBase *
+  createSegmentNoteIteratorImpl(bool IsEnd) const = 0;
+  virtual ElfLSectionIteratorImplBase *
+  createSectionIteratorImpl(bool IsEnd) const = 0;
+};
+
+template <class ELFT> class ElfLImpl : public ElfLImplBase {
+  std::unique_ptr<ELFObjectFile<ELFT>> File;
+  Error *Err = nullptr;
+
+  friend class ElfL;
+
+public:
+  ElfLImpl(std::unique_ptr<ObjectFile> F) {
+    ObjectFile *FPtr = F.release();
+    if (auto *Obj = dyn_cast<ELFObjectFile<ELFT>>(FPtr))
+      File = std::unique_ptr<ELFObjectFile<ELFT>>(Obj);
+    else
+      assert(false && "Not an ELF object file, or ELF class is wrong.");
+
+    Err = new Error(Error::success());
+  }
+  ElfLImpl(const ElfLImpl &) = delete;
+  ElfLImpl &operator=(const ElfLImpl &) = delete;
+  virtual ~ElfLImpl() {
+    if (!Err)
+      return;
+
+    if (*Err) {
+      auto ErrorString = toString(std::move(*Err));
+      DP("Destroying ELF object parsed with errors: %s\n", ErrorString.c_str());
+    }
+    // toString() consumes the error, so Err can be safely deleted in either
+    // case.
+    delete Err;
+    Err = nullptr;
+  }
+  uint16_t getEMachine() const override {
+    return cast<ELFObjectFileBase>(File.get())->getEMachine();
+  }
+  uint16_t getEType() const override {
+    return cast<ELFObjectFileBase>(File.get())->getEType();
+  }
+
+  ElfLNoteIteratorImplBase *
+  createSectionNoteIteratorImpl(bool IsEnd) const override {
+    return new ElfLSectionNoteIteratorImpl<ELFT>(File->getELFFile(), *Err,
+                                                 IsEnd);
+  }
+
+  ElfLNoteIteratorImplBase *
+  createSegmentNoteIteratorImpl(bool IsEnd) const override {
+    return new ElfLSegmentNoteIteratorImpl<ELFT>(File->getELFFile(), *Err,
+                                                 IsEnd);
+  }
+
+  ElfLSectionIteratorImplBase *
+  createSectionIteratorImpl(bool IsEnd) const override {
+    return new ElfLSectionIteratorImpl<ELFT>(File->getELFFile(), IsEnd);
+  }
+};
+
+ElfL::ElfL(char *Begin, size_t Size) {
+  StringRef StrBuf(Begin, Size);
+  std::unique_ptr<MemoryBuffer> MemBuf =
+      MemoryBuffer::getMemBuffer(StrBuf, "", false);
+  Expected<std::unique_ptr<ObjectFile>> BinOrErr =
+      ObjectFile::createELFObjectFile(MemBuf->getMemBufferRef(),
+                                      /*InitContent=*/false);
+  if (!BinOrErr) {
+    consumeError(BinOrErr.takeError());
+    return;
+  }
+
+  if (isa<ELF64LEObjectFile>(BinOrErr->get()))
+    Impl =
+        reinterpret_cast<void *>(new ElfLImpl<ELF64LE>(std::move(*BinOrErr)));
+  else if (isa<ELF32LEObjectFile>(BinOrErr->get()))
+    Impl =
+        reinterpret_cast<void *>(new ElfLImpl<ELF32LE>(std::move(*BinOrErr)));
+  else if (isa<ELF32BEObjectFile>(BinOrErr->get()))
+    Impl =
+        reinterpret_cast<void *>(new ElfLImpl<ELF32BE>(std::move(*BinOrErr)));
+  else if (isa<ELF64BEObjectFile>(BinOrErr->get()))
+    Impl =
+        reinterpret_cast<void *>(new ElfLImpl<ELF64BE>(std::move(*BinOrErr)));
+}
+
+ElfL::~ElfL() {
+  ElfLImplBase *EImpl = reinterpret_cast<ElfLImplBase *>(Impl);
+  delete EImpl;
+}
+
+bool ElfL::isValidElf() const { return Impl; }
+
+const char *ElfL::getErrmsg(int N) const {
+  // TODO: return text representation for the latest Error.
+  return "LLVM ELF error";
+}
+
+uint16_t ElfL::getEMachine() const {
+  assert(isValidElf() && "Invalid ELF.");
+  ElfLImplBase *EImpl = reinterpret_cast<ElfLImplBase *>(Impl);
+  return EImpl->getEMachine();
+}
+
+uint16_t ElfL::getEType() const {
+  assert(isValidElf() && "Invalid ELF.");
+  ElfLImplBase *EImpl = reinterpret_cast<ElfLImplBase *>(Impl);
+  return EImpl->getEType();
+}
+
+bool ElfL::isDynType(uint16_t Ty) { return Ty == ET_DYN; }
+
+ElfLSectionNoteIterator::ElfLSectionNoteIterator(const void *I, bool IsEnd) {
+  const ElfLImplBase *EImpl = reinterpret_cast<const ElfLImplBase *>(I);
+  // Create new ElfLSectionNoteIteratorImpl<ELFT> object.
+  Impl = EImpl->createSectionNoteIteratorImpl(IsEnd);
+}
+
+ElfLSectionNoteIterator::~ElfLSectionNoteIterator() {
+  const ElfLNoteIteratorImplBase *IImpl =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Impl);
+  delete IImpl;
+}
+
+ElfLSectionNoteIterator::ElfLSectionNoteIterator(
+    const ElfLSectionNoteIterator &Other) {
+  const ElfLNoteIteratorImplBase *IImpl =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Other.Impl);
+  Impl = IImpl->clone();
+}
+
+bool ElfLSectionNoteIterator::operator==(
+    const ElfLSectionNoteIterator Other) const {
+  const ElfLNoteIteratorImplBase *Lhs =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Impl);
+  const ElfLNoteIteratorImplBase *Rhs =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Other.Impl);
+  return (*Lhs == *Rhs);
+}
+
+bool ElfLSectionNoteIterator::operator!=(
+    const ElfLSectionNoteIterator Other) const {
+  return !(*this == Other);
+}
+
+ElfLSectionNoteIterator &ElfLSectionNoteIterator::operator++() {
+  ElfLNoteIteratorImplBase *EImpl =
+      reinterpret_cast<ElfLNoteIteratorImplBase *>(Impl);
+  ++(*EImpl);
+  return *this;
+}
+
+ElfLNote ElfLSectionNoteIterator::operator*() const { return ElfLNote(Impl); }
+
+ElfLSectionNoteIterator ElfL::section_notes_begin() const {
+  assert(isValidElf() && "Invalid ELF.");
+  return ElfLSectionNoteIterator(reinterpret_cast<const ElfLImplBase *>(Impl));
+}
+
+ElfLSectionNoteIterator ElfL::section_notes_end() const {
+  assert(isValidElf() && "Invalid ELF.");
+  return ElfLSectionNoteIterator(reinterpret_cast<const ElfLImplBase *>(Impl),
+                                 true);
+}
+
+ElfLSegmentNoteIterator::ElfLSegmentNoteIterator(const void *I, bool IsEnd) {
+  const ElfLImplBase *EImpl = reinterpret_cast<const ElfLImplBase *>(I);
+  // Create new ElfLSegmentNoteIteratorImpl<ELFT> object.
+  Impl = EImpl->createSegmentNoteIteratorImpl(IsEnd);
+}
+
+ElfLSegmentNoteIterator::~ElfLSegmentNoteIterator() {
+  const ElfLNoteIteratorImplBase *IImpl =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Impl);
+  delete IImpl;
+}
+
+ElfLSegmentNoteIterator::ElfLSegmentNoteIterator(
+    const ElfLSegmentNoteIterator &Other) {
+  const ElfLNoteIteratorImplBase *IImpl =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Other.Impl);
+  Impl = IImpl->clone();
+}
+
+bool ElfLSegmentNoteIterator::operator==(
+    const ElfLSegmentNoteIterator Other) const {
+  const ElfLNoteIteratorImplBase *Lhs =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Impl);
+  const ElfLNoteIteratorImplBase *Rhs =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(Other.Impl);
+  return (*Lhs == *Rhs);
+}
+
+bool ElfLSegmentNoteIterator::operator!=(
+    const ElfLSegmentNoteIterator Other) const {
+  return !(*this == Other);
+}
+
+ElfLSegmentNoteIterator &ElfLSegmentNoteIterator::operator++() {
+  ElfLNoteIteratorImplBase *EImpl =
+      reinterpret_cast<ElfLNoteIteratorImplBase *>(Impl);
+  ++(*EImpl);
+  return *this;
+}
+
+ElfLNote ElfLSegmentNoteIterator::operator*() const { return ElfLNote(Impl); }
+
+ElfLSegmentNoteIterator ElfL::segment_notes_begin() const {
+  assert(isValidElf() && "Invalid ELF.");
+  return ElfLSegmentNoteIterator(reinterpret_cast<const ElfLImplBase *>(Impl));
+}
+
+ElfLSegmentNoteIterator ElfL::segment_notes_end() const {
+  assert(isValidElf() && "Invalid ELF.");
+  return ElfLSegmentNoteIterator(reinterpret_cast<const ElfLImplBase *>(Impl),
+                                 true);
+}
+
+ElfLNote::ElfLNote(const void *IteratorImpl) {
+  const ElfLNoteIteratorImplBase *IImpl =
+      reinterpret_cast<const ElfLNoteIteratorImplBase *>(IteratorImpl);
+  Impl = **IImpl;
+}
+
+ElfLNote::ElfLNote(const ElfLNote &Other) {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Other.Impl);
+  Impl = NImpl ? NImpl->clone() : nullptr;
+}
+
+ElfLNote::~ElfLNote() {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Impl);
+  delete NImpl;
+}
+
+uint64_t ElfLNote::getNameSize() const {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Impl);
+  return NImpl->getNameSize();
+}
+
+const char *ElfLNote::getName() const {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Impl);
+  return NImpl->getName();
+}
+
+uint64_t ElfLNote::getDescSize() const {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Impl);
+  return NImpl->getDescSize();
+}
+
+const uint8_t *ElfLNote::getDesc() const {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Impl);
+  return NImpl->getDesc();
+}
+
+uint64_t ElfLNote::getType() const {
+  const ElfLNoteImplBase *NImpl =
+      reinterpret_cast<const ElfLNoteImplBase *>(Impl);
+  return NImpl->getType();
+}
+
+ElfLSection::ElfLSection(const void *I) { Impl = I; }
+
+ElfLSection::ElfLSection(const ElfLSection &Other) {
+  const ElfLSectionImplBase *SImpl =
+      reinterpret_cast<const ElfLSectionImplBase *>(Other.Impl);
+  Impl = SImpl->clone();
+}
+
+ElfLSection::~ElfLSection() {
+  const ElfLSectionImplBase *SImpl =
+      reinterpret_cast<const ElfLSectionImplBase *>(Impl);
+  delete SImpl;
+}
+
+const char *ElfLSection::getName() const {
+  const ElfLSectionImplBase *SImpl =
+      reinterpret_cast<const ElfLSectionImplBase *>(Impl);
+  return SImpl->getName();
+}
+
+uint64_t ElfLSection::getSize() const {
+  const ElfLSectionImplBase *SImpl =
+      reinterpret_cast<const ElfLSectionImplBase *>(Impl);
+  return SImpl->getSize();
+}
+
+const uint8_t *ElfLSection::getContents() const {
+  const ElfLSectionImplBase *SImpl =
+      reinterpret_cast<const ElfLSectionImplBase *>(Impl);
+  return SImpl->getContents();
+}
+
+ElfLSectionIterator ElfL::sections_begin() const {
+  assert(isValidElf() && "Invalid ELF.");
+  return ElfLSectionIterator(reinterpret_cast<const ElfLImplBase *>(Impl));
+}
+
+ElfLSectionIterator ElfL::sections_end() const {
+  assert(isValidElf() && "Invalid ELF.");
+  return ElfLSectionIterator(reinterpret_cast<const ElfLImplBase *>(Impl),
+                             true);
+}
+
+ElfLSectionIterator::ElfLSectionIterator(const void *I, bool IsEnd) {
+  const ElfLImplBase *EImpl = reinterpret_cast<const ElfLImplBase *>(I);
+  Impl = EImpl->createSectionIteratorImpl(IsEnd);
+}
+
+ElfLSectionIterator::ElfLSectionIterator(
+    const ElfLSectionIterator &Other) {
+  ElfLSectionIteratorImplBase *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImplBase *>(Other.Impl);
+  Impl = IImpl->clone();
+}
+
+ElfLSectionIterator::~ElfLSectionIterator() {
+  ElfLSectionIteratorImplBase *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImplBase *>(Impl);
+  delete IImpl;
+}
+
+bool ElfLSectionIterator::operator==(
+    const ElfLSectionIterator Other) const {
+  const ElfLSectionIteratorImplBase *Lhs =
+      reinterpret_cast<const ElfLSectionIteratorImplBase *>(Impl);
+  const ElfLSectionIteratorImplBase *Rhs =
+      reinterpret_cast<const ElfLSectionIteratorImplBase *>(Other.Impl);
+  return (*Lhs == *Rhs);
+}
+
+bool ElfLSectionIterator::operator!=(
+    const ElfLSectionIterator Other) const {
+  return !(*this == Other);
+}
+
+ElfLSectionIterator &ElfLSectionIterator::operator++() {
+  ElfLSectionIteratorImplBase *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImplBase *>(Impl);
+  ++(*IImpl);
+  return *this;
+}
+
+ElfLSection ElfLSectionIterator::operator*() const {
+  ElfLSectionIteratorImplBase *IImpl =
+      reinterpret_cast<ElfLSectionIteratorImplBase *>(Impl);
+  return ElfLSection(**IImpl);
+}
+#endif // !MAY_USE_LIBELF
diff --git a/openmp/libomptarget/plugins/opencl/CMakeLists.txt b/openmp/libomptarget/plugins/opencl/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/opencl/CMakeLists.txt
@@ -0,0 +1,69 @@
+##===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+#
+# Plugin for OpenCL device
+#
+##===----------------------------------------------------------------------===##
+
+if(LIBOMPTARGET_DEP_OPENCL_FOUND)
+  if((CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)$" OR
+      CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") AND
+     (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Windows"))
+
+    libomptarget_say("Building OpenCL offloading plugin.")
+
+    # Define the suffix for the runtime messaging dumps.
+    add_definitions(-DTARGET_NAME=OPENCL)
+    add_definitions(-DCL_TARGET_OPENCL_VERSION=210)
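+    # CL_TARGET_OPENCL_VERSION=210 requests the OpenCL 2.1 API surface from
+    # the headers; the plugin relies on SVM (OpenCL 2.0) and
+    # clCreateProgramWithIL (OpenCL 2.1), so it needs at least that.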
+
+    if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug)
+      add_definitions(-DSPIR_ERROR_REPORT)
+      add_definitions(-DOMPTARGET_OPENCL_DEBUG)
+    endif()
+
+    add_library(omptarget.rtl.opencl SHARED src/rtl.cpp)
+
+    target_include_directories(omptarget.rtl.opencl PRIVATE
+      ${LIBOMPTARGET_INCLUDE_DIR}
+      ${LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS}
+      ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
+    )
+
+    if (NOT OPENMP_STANDALONE_BUILD)
+      # We have to disable EH for Windows compilation.
+      # For a standalone OpenMP build, we would need to manage
+      # the EH flags ourselves.
+      llvm_update_compile_flags(omptarget.rtl.opencl)
+    endif()
+
+    install(TARGETS omptarget.rtl.opencl
+            LIBRARY DESTINATION lib${LIBOMPTARGET_LIBDIR_SUFFIX})
+
+    # OpenMP runtime library must be linked in.
+    target_link_libraries(omptarget.rtl.opencl PRIVATE omp)
+
+    target_link_libraries(omptarget.rtl.opencl PRIVATE elf_common LLVMSupport
+                          ${LIBOMPTARGET_DEP_OPENCL_LIBRARIES})
+
+    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+      target_link_libraries(omptarget.rtl.opencl PRIVATE dl
+          ${OPENMP_PTHREAD_LIB}
+          "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+    elseif (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
+      message(FATAL_ERROR "Missing platform support.")
+    endif()
+
+    # Report to the parent scope that we are building a plugin for OpenCL.
+    set(LIBOMPTARGET_SYSTEM_TARGETS
+        "${LIBOMPTARGET_SYSTEM_TARGETS} spir64-unknown-unknown" PARENT_SCOPE)
+  else()
+    libomptarget_say("Not building OpenCL offloading plugin: the plugin is only supported on x86_64 Linux and Windows hosts.")
+  endif()
+else()
+  libomptarget_say("Not building OpenCL offloading plugin: OpenCL not found on the system.")
+endif()
diff --git a/openmp/libomptarget/plugins/opencl/src/rtl-trace.h b/openmp/libomptarget/plugins/opencl/src/rtl-trace.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/opencl/src/rtl-trace.h
@@ -0,0 +1,1176 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code for tracing RTL
+//
+//===----------------------------------------------------------------------===//
+#ifndef RTL_TRACE_H
+#define RTL_TRACE_H
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <inttypes.h>
+#include <string>
+
+#include "omptarget.h"
+
+#define STR(x) #x
+#define TO_STRING(x) STR(x)
+
+#define TARGET_NAME OPENCL
+#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
+
+// Debug.h provides DP() and GETNAME(); the prefix macros above are defined
+// first so they are in place wherever the debug macros are expanded.
+#include "Debug.h"
+
+extern int DebugLevel;
+
+#define DPCALL(...)                                                            \
+  do {                                                                         \
+    if (DebugLevel > 1)                                                        \
+      DP(__VA_ARGS__);                                                         \
+  } while (0)
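+
+// Illustrative: DPCALL forwards to DP() only when DebugLevel exceeds 1, so
+// per-call argument traces stay silent at the default debug level, e.g.
+//   DPCALL("CL_CALLEE: clFinish (\n"); // printed only when DebugLevel > 1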
+
+#define FATAL_ERROR(Msg)                                                       \
+  do {                                                                         \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
+    fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg);   \
+    exit(EXIT_FAILURE);                                                        \
+  } while (0)
+
+#define WARNING(...)                                                           \
+  do {                                                                         \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX);                                  \
+    fprintf(stderr, "Warning: " __VA_ARGS__);                                  \
+  } while (0)
+
+typedef cl_int (CL_API_CALL *clGetDeviceGlobalVariablePointerINTEL_fn)(
+    cl_device_id,
+    cl_program,
+    const char *,
+    size_t *,
+    void **);
+typedef cl_int (CL_API_CALL *clGetKernelSuggestedLocalWorkSizeINTEL_fn)(
+    cl_command_queue,
+    cl_kernel,
+    cl_uint,
+    const size_t *,
+    const size_t *,
+    size_t *);
+typedef cl_int (CL_API_CALL *clSetProgramSpecializationConstant_fn)(
+    cl_program, cl_uint, size_t, const void *);
+
+#define FOR_EACH_COMMON_EXTENSION_FN(M)                                        \
+  M(clGetMemAllocInfoINTEL)                                                    \
+  M(clHostMemAllocINTEL)                                                       \
+  M(clDeviceMemAllocINTEL)                                                     \
+  M(clSharedMemAllocINTEL)                                                     \
+  M(clMemFreeINTEL)                                                            \
+  M(clSetKernelArgMemPointerINTEL)                                             \
+  M(clEnqueueMemcpyINTEL)                                                      \
+  M(clSetProgramSpecializationConstant)                                        \
+  M(clGetDeviceGlobalVariablePointerINTEL)                                     \
+  M(clGetKernelSuggestedLocalWorkSizeINTEL)
+
+#define FOR_EACH_EXTENSION_FN(M) FOR_EACH_COMMON_EXTENSION_FN(M)
+
+enum ExtensionIdTy {
+#define EXTENSION_FN_ID(Fn) Fn##Id,
+  FOR_EACH_EXTENSION_FN(EXTENSION_FN_ID)
+  ExtensionIdLast
+};
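+
+// The X-macro expands to one enumerator per extension, e.g.
+// clHostMemAllocINTELId, clMemFreeINTELId, ..., with ExtensionIdLast serving
+// as the count (usable as the size of a per-device function-pointer table;
+// that table is assumed to live in rtl.cpp).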
+
+#define FOREACH_CL_ERROR_CODE(FN)                                              \
+  FN(CL_SUCCESS)                                                               \
+  FN(CL_DEVICE_NOT_FOUND)                                                      \
+  FN(CL_DEVICE_NOT_AVAILABLE)                                                  \
+  FN(CL_COMPILER_NOT_AVAILABLE)                                                \
+  FN(CL_MEM_OBJECT_ALLOCATION_FAILURE)                                         \
+  FN(CL_OUT_OF_RESOURCES)                                                      \
+  FN(CL_OUT_OF_HOST_MEMORY)                                                    \
+  FN(CL_PROFILING_INFO_NOT_AVAILABLE)                                          \
+  FN(CL_MEM_COPY_OVERLAP)                                                      \
+  FN(CL_IMAGE_FORMAT_MISMATCH)                                                 \
+  FN(CL_IMAGE_FORMAT_NOT_SUPPORTED)                                            \
+  FN(CL_BUILD_PROGRAM_FAILURE)                                                 \
+  FN(CL_MAP_FAILURE)                                                           \
+  FN(CL_MISALIGNED_SUB_BUFFER_OFFSET)                                          \
+  FN(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)                             \
+  FN(CL_COMPILE_PROGRAM_FAILURE)                                               \
+  FN(CL_LINKER_NOT_AVAILABLE)                                                  \
+  FN(CL_LINK_PROGRAM_FAILURE)                                                  \
+  FN(CL_DEVICE_PARTITION_FAILED)                                               \
+  FN(CL_KERNEL_ARG_INFO_NOT_AVAILABLE)                                         \
+  FN(CL_INVALID_VALUE)                                                         \
+  FN(CL_INVALID_DEVICE_TYPE)                                                   \
+  FN(CL_INVALID_PLATFORM)                                                      \
+  FN(CL_INVALID_DEVICE)                                                        \
+  FN(CL_INVALID_CONTEXT)                                                       \
+  FN(CL_INVALID_QUEUE_PROPERTIES)                                              \
+  FN(CL_INVALID_COMMAND_QUEUE)                                                 \
+  FN(CL_INVALID_HOST_PTR)                                                      \
+  FN(CL_INVALID_MEM_OBJECT)                                                    \
+  FN(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)                                       \
+  FN(CL_INVALID_IMAGE_SIZE)                                                    \
+  FN(CL_INVALID_SAMPLER)                                                       \
+  FN(CL_INVALID_BINARY)                                                        \
+  FN(CL_INVALID_BUILD_OPTIONS)                                                 \
+  FN(CL_INVALID_PROGRAM)                                                       \
+  FN(CL_INVALID_PROGRAM_EXECUTABLE)                                            \
+  FN(CL_INVALID_KERNEL_NAME)                                                   \
+  FN(CL_INVALID_KERNEL_DEFINITION)                                             \
+  FN(CL_INVALID_KERNEL)                                                        \
+  FN(CL_INVALID_ARG_INDEX)                                                     \
+  FN(CL_INVALID_ARG_VALUE)                                                     \
+  FN(CL_INVALID_ARG_SIZE)                                                      \
+  FN(CL_INVALID_KERNEL_ARGS)                                                   \
+  FN(CL_INVALID_WORK_DIMENSION)                                                \
+  FN(CL_INVALID_WORK_GROUP_SIZE)                                               \
+  FN(CL_INVALID_WORK_ITEM_SIZE)                                                \
+  FN(CL_INVALID_GLOBAL_OFFSET)                                                 \
+  FN(CL_INVALID_EVENT_WAIT_LIST)                                               \
+  FN(CL_INVALID_EVENT)                                                         \
+  FN(CL_INVALID_OPERATION)                                                     \
+  FN(CL_INVALID_GL_OBJECT)                                                     \
+  FN(CL_INVALID_BUFFER_SIZE)                                                   \
+  FN(CL_INVALID_MIP_LEVEL)                                                     \
+  FN(CL_INVALID_GLOBAL_WORK_SIZE)                                              \
+  FN(CL_INVALID_PROPERTY)                                                      \
+  FN(CL_INVALID_IMAGE_DESCRIPTOR)                                              \
+  FN(CL_INVALID_COMPILER_OPTIONS)                                              \
+  FN(CL_INVALID_LINKER_OPTIONS)                                                \
+  FN(CL_INVALID_DEVICE_PARTITION_COUNT)                                        \
+  FN(CL_INVALID_PIPE_SIZE)                                                     \
+  FN(CL_INVALID_DEVICE_QUEUE)
+
+#define CASE_TO_STRING(s) case s: return #s;
+
+const char *getCLErrorName(int error) {
+  switch (error) {
+    FOREACH_CL_ERROR_CODE(CASE_TO_STRING)
+  default:
+    return "Unknown Error";
+  }
+}
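+
+// Example: getCLErrorName(CL_INVALID_VALUE) returns "CL_INVALID_VALUE";
+// codes outside the list above map to "Unknown Error".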
+
+#define TRACE_FN(Name) CLTR##Name
+#define TRACE_FN_ARG_BEGIN()                                                   \
+  do {                                                                         \
+    std::string fn(__func__);                                                  \
+    DPCALL("CL_CALLEE: %s (\n", fn.substr(4).c_str());                         \
+  } while (0)
+#define TRACE_FN_ARG_END() DPCALL(")\n")
+#define TRACE_FN_ARG(Arg, Fmt) DPCALL("    %s = " Fmt "\n", TO_STRING(Arg), Arg)
+#define TRACE_FN_ARG_PTR(Arg)                                                  \
+  DPCALL("    %s = " DPxMOD "\n", TO_STRING(Arg), DPxPTR(Arg))
+#define TRACE_FN_ARG_INT(Arg) TRACE_FN_ARG(Arg, "%" PRId32)
+#define TRACE_FN_ARG_SIZE(Arg) TRACE_FN_ARG(Arg, "%zu")
+#define TRACE_FN_ARG_UINT(Arg) TRACE_FN_ARG(Arg, "%" PRIu32)
+#define TRACE_FN_ARG_ULONG(Arg) TRACE_FN_ARG(Arg, "%" PRIu64)
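+
+// For instance, TRACE_FN_ARG_SIZE(size) expands to
+//   DPCALL("    %s = %zu\n", "size", size)
+// so each traced argument prints as "name = value" on its own line.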
+
+cl_int TRACE_FN(clCompileProgram)(
+    cl_program program,
+    cl_uint num_devices,
+    const cl_device_id *device_list,
+    const char *options,
+    cl_uint num_input_headers,
+    const cl_program *input_headers,
+    const char **header_include_names,
+    void (CL_CALLBACK *pfn_notify)(cl_program, void *),
+    void *user_data) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_UINT(num_devices);
+  TRACE_FN_ARG_PTR(device_list);
+  TRACE_FN_ARG_PTR(options);
+  TRACE_FN_ARG_UINT(num_input_headers);
+  TRACE_FN_ARG_PTR(input_headers);
+  TRACE_FN_ARG_PTR(header_include_names);
+  TRACE_FN_ARG_PTR(pfn_notify);
+  TRACE_FN_ARG_PTR(user_data);
+  TRACE_FN_ARG_END();
+  return clCompileProgram(program, num_devices, device_list, options,
+                          num_input_headers, input_headers,
+                          header_include_names, pfn_notify, user_data);
+}
+
+cl_int TRACE_FN(clBuildProgram)(
+    cl_program program,
+    cl_uint num_devices,
+    const cl_device_id *device_list,
+    const char *options,
+    void (CL_CALLBACK *pfn_notify)(cl_program, void *),
+    void *user_data) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_UINT(num_devices);
+  TRACE_FN_ARG_PTR(device_list);
+  TRACE_FN_ARG_PTR(options);
+  TRACE_FN_ARG_PTR(pfn_notify);
+  TRACE_FN_ARG_PTR(user_data);
+  TRACE_FN_ARG_END();
+  return clBuildProgram(program, num_devices, device_list, options, pfn_notify,
+                        user_data);
+}
+
+cl_mem TRACE_FN(clCreateBuffer)(
+    cl_context context,
+    cl_mem_flags flags,
+    size_t size,
+    void *host_ptr,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_ULONG(flags);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_PTR(host_ptr);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clCreateBuffer(context, flags, size, host_ptr, errcode_ret);
+}
+
+cl_command_queue TRACE_FN(clCreateCommandQueueWithProperties)(
+    cl_context context,
+    cl_device_id device,
+    const cl_queue_properties *properties,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_PTR(properties);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clCreateCommandQueueWithProperties(context, device, properties,
+                                            errcode_ret);
+}
+
+cl_context TRACE_FN(clCreateContext)(
+    const cl_context_properties *properties,
+    cl_uint num_devices,
+    const cl_device_id *devices,
+    void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
+    void *user_data,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(properties);
+  TRACE_FN_ARG_UINT(num_devices);
+  TRACE_FN_ARG_PTR(devices);
+  TRACE_FN_ARG_PTR(pfn_notify);
+  TRACE_FN_ARG_PTR(user_data);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clCreateContext(properties, num_devices, devices, pfn_notify,
+                         user_data, errcode_ret);
+}
+
+cl_kernel TRACE_FN(clCreateKernel)(
+    cl_program program,
+    const char *kernel_name,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_PTR(kernel_name);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clCreateKernel(program, kernel_name, errcode_ret);
+}
+
+cl_program TRACE_FN(clCreateProgramWithIL)(
+    cl_context context,
+    const void *il,
+    size_t length,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(il);
+  TRACE_FN_ARG_SIZE(length);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clCreateProgramWithIL(context, il, length, errcode_ret);
+}
+
+cl_int TRACE_FN(clEnqueueBarrierWithWaitList)(
+    cl_command_queue command_queue,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueBarrierWithWaitList(command_queue, num_events_in_wait_list,
+                                      event_wait_list, event);
+}
+
+cl_int TRACE_FN(clEnqueueNDRangeKernel)(
+    cl_command_queue command_queue,
+    cl_kernel kernel,
+    cl_uint work_dim,
+    const size_t *global_work_offset,
+    const size_t *global_work_size,
+    const size_t *local_work_size,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(work_dim);
+  TRACE_FN_ARG_PTR(global_work_offset);
+  TRACE_FN_ARG_PTR(global_work_size);
+  TRACE_FN_ARG_PTR(local_work_size);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueNDRangeKernel(command_queue, kernel, work_dim,
+                                global_work_offset, global_work_size,
+                                local_work_size, num_events_in_wait_list,
+                                event_wait_list, event);
+}
+
+cl_int TRACE_FN(clEnqueueReadBuffer)(
+    cl_command_queue command_queue,
+    cl_mem buffer,
+    cl_bool blocking_read,
+    size_t offset,
+    size_t size,
+    void *ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_PTR(buffer);
+  TRACE_FN_ARG_UINT(blocking_read);
+  TRACE_FN_ARG_SIZE(offset);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_PTR(ptr);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueReadBuffer(command_queue, buffer, blocking_read, offset, size,
+                             ptr, num_events_in_wait_list, event_wait_list,
+                             event);
+}
+
+cl_int TRACE_FN(clEnqueueSVMMap)(
+    cl_command_queue command_queue,
+    cl_bool blocking_map,
+    cl_map_flags flags,
+    void *svm_ptr,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_UINT(blocking_map);
+  TRACE_FN_ARG_ULONG(flags);
+  TRACE_FN_ARG_PTR(svm_ptr);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size,
+                         num_events_in_wait_list, event_wait_list, event);
+}
+
+cl_int TRACE_FN(clEnqueueSVMMemcpy)(
+    cl_command_queue command_queue,
+    cl_bool blocking_copy,
+    void *dst_ptr,
+    const void *src_ptr,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_UINT(blocking_copy);
+  TRACE_FN_ARG_PTR(dst_ptr);
+  TRACE_FN_ARG_PTR(src_ptr);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueSVMMemcpy(command_queue, blocking_copy, dst_ptr, src_ptr,
+                            size, num_events_in_wait_list, event_wait_list,
+                            event);
+}
+
+cl_int TRACE_FN(clEnqueueMemcpyINTEL)(
+    clEnqueueMemcpyINTEL_fn funcptr,
+    cl_command_queue command_queue,
+    cl_bool blocking,
+    void *dst_ptr,
+    const void *src_ptr,
+    size_t size,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_UINT(blocking);
+  TRACE_FN_ARG_PTR(dst_ptr);
+  TRACE_FN_ARG_PTR(src_ptr);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return funcptr(command_queue, blocking, dst_ptr, src_ptr, size,
+                 num_events_in_wait_list, event_wait_list, event);
+}
+
+cl_int TRACE_FN(clEnqueueSVMUnmap)(
+    cl_command_queue command_queue,
+    void *svm_ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_PTR(svm_ptr);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueSVMUnmap(command_queue, svm_ptr, num_events_in_wait_list,
+                           event_wait_list, event);
+}
+
+cl_int TRACE_FN(clEnqueueWriteBuffer)(
+    cl_command_queue command_queue,
+    cl_mem buffer,
+    cl_bool blocking_write,
+    size_t offset,
+    size_t size,
+    const void *ptr,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_PTR(buffer);
+  TRACE_FN_ARG_UINT(blocking_write);
+  TRACE_FN_ARG_SIZE(offset);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_PTR(ptr);
+  TRACE_FN_ARG_UINT(num_events_in_wait_list);
+  TRACE_FN_ARG_PTR(event_wait_list);
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_END();
+  return clEnqueueWriteBuffer(command_queue, buffer, blocking_write, offset,
+                              size, ptr, num_events_in_wait_list,
+                              event_wait_list, event);
+}
+
+cl_int TRACE_FN(clFinish)(
+    cl_command_queue command_queue) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_END();
+  return clFinish(command_queue);
+}
+
+cl_int TRACE_FN(clGetDeviceAndHostTimer)(
+    cl_device_id device,
+    cl_ulong *device_timestamp,
+    cl_ulong *host_timestamp) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_PTR(device_timestamp);
+  TRACE_FN_ARG_PTR(host_timestamp);
+  TRACE_FN_ARG_END();
+  return clGetDeviceAndHostTimer(device, device_timestamp, host_timestamp);
+}
+
+cl_int TRACE_FN(clGetDeviceIDs)(
+    cl_platform_id platform,
+    cl_device_type device_type,
+    cl_uint num_entries,
+    cl_device_id *devices,
+    cl_uint *num_devices) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(platform);
+  TRACE_FN_ARG_ULONG(device_type);
+  TRACE_FN_ARG_UINT(num_entries);
+  TRACE_FN_ARG_PTR(devices);
+  TRACE_FN_ARG_PTR(num_devices);
+  TRACE_FN_ARG_END();
+  return clGetDeviceIDs(platform, device_type, num_entries, devices,
+                        num_devices);
+}
+
+cl_int TRACE_FN(clGetDeviceInfo)(
+    cl_device_id device,
+    cl_device_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetDeviceInfo(device, param_name, param_value_size, param_value,
+                         param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetEventInfo)(
+    cl_event event,
+    cl_event_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetEventInfo(event, param_name, param_value_size, param_value,
+                        param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetEventProfilingInfo)(
+    cl_event event,
+    cl_profiling_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetEventProfilingInfo(event, param_name, param_value_size,
+                                 param_value, param_value_size_ret);
+}
+
+void *TRACE_FN(clGetExtensionFunctionAddressForPlatform)(
+    cl_platform_id platform,
+    const char *funcname) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(platform);
+  TRACE_FN_ARG_PTR(funcname);
+  TRACE_FN_ARG_END();
+  return clGetExtensionFunctionAddressForPlatform(platform, funcname);
+}
+
+cl_int TRACE_FN(clGetKernelArgInfo)(
+    cl_kernel kernel,
+    cl_uint arg_index,
+    cl_kernel_arg_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(arg_index);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetKernelArgInfo(kernel, arg_index, param_name, param_value_size,
+                            param_value, param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetKernelInfo)(
+    cl_kernel kernel,
+    cl_kernel_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetKernelInfo(kernel, param_name, param_value_size, param_value,
+                         param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetKernelSubGroupInfo)(
+    cl_kernel kernel,
+    cl_device_id device,
+    cl_kernel_sub_group_info param_name,
+    size_t input_value_size,
+    const void *input_value,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(input_value_size);
+  TRACE_FN_ARG_PTR(input_value);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetKernelSubGroupInfo(kernel, device, param_name, input_value_size,
+                                 input_value, param_value_size, param_value,
+                                 param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetKernelWorkGroupInfo)(
+    cl_kernel kernel,
+    cl_device_id device,
+    cl_kernel_work_group_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetKernelWorkGroupInfo(kernel, device, param_name, param_value_size,
+                                  param_value, param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetMemAllocInfoINTEL)(
+    clGetMemAllocInfoINTEL_fn funcptr,
+    cl_context context,
+    const void *ptr,
+    cl_mem_info_intel param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(ptr);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return funcptr(context, ptr, param_name, param_value_size, param_value,
+                 param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetPlatformIDs)(
+    cl_uint num_entries,
+    cl_platform_id *platforms,
+    cl_uint *num_platforms) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_UINT(num_entries);
+  TRACE_FN_ARG_PTR(platforms);
+  TRACE_FN_ARG_PTR(num_platforms);
+  TRACE_FN_ARG_END();
+  return clGetPlatformIDs(num_entries, platforms, num_platforms);
+}
+
+cl_int TRACE_FN(clGetPlatformInfo)(
+    cl_platform_id platform,
+    cl_platform_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(platform);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetPlatformInfo(platform, param_name, param_value_size, param_value,
+                           param_value_size_ret);
+}
+
+cl_int TRACE_FN(clGetProgramBuildInfo)(
+    cl_program program,
+    cl_device_id device,
+    cl_program_build_info param_name,
+    size_t param_value_size,
+    void *param_value,
+    size_t *param_value_size_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_PTR(param_value_size_ret);
+  TRACE_FN_ARG_END();
+  return clGetProgramBuildInfo(program, device, param_name, param_value_size,
+                               param_value, param_value_size_ret);
+}
+
+void *TRACE_FN(clHostMemAllocINTEL)(
+    clHostMemAllocINTEL_fn funcptr,
+    cl_context context,
+    const cl_mem_properties_intel *properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(properties);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(alignment);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return funcptr(context, properties, size, alignment, errcode_ret);
+}
+
+void *TRACE_FN(clDeviceMemAllocINTEL)(
+    clDeviceMemAllocINTEL_fn funcptr,
+    cl_context context,
+    cl_device_id device,
+    const cl_mem_properties_intel *properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_PTR(properties);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(alignment);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return funcptr(context, device, properties, size, alignment, errcode_ret);
+}
+
+cl_program TRACE_FN(clLinkProgram)(
+    cl_context context,
+    cl_uint num_devices,
+    const cl_device_id *device_list,
+    const char *options,
+    cl_uint num_input_programs,
+    const cl_program *input_programs,
+    void (CL_CALLBACK *pfn_notify)(cl_program, void *),
+    void *user_data,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_UINT(num_devices);
+  TRACE_FN_ARG_PTR(device_list);
+  TRACE_FN_ARG_PTR(options);
+  TRACE_FN_ARG_UINT(num_input_programs);
+  TRACE_FN_ARG_PTR(input_programs);
+  TRACE_FN_ARG_PTR(pfn_notify);
+  TRACE_FN_ARG_PTR(user_data);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clLinkProgram(context, num_devices, device_list, options,
+                       num_input_programs, input_programs, pfn_notify,
+                       user_data, errcode_ret);
+}
+
+cl_program TRACE_FN(clCreateProgramWithBinary)(
+    cl_context context,
+    cl_uint num_devices,
+    const cl_device_id *device_list,
+    const size_t *lengths,
+    const unsigned char **binaries,
+    cl_int *binary_status,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_UINT(num_devices);
+  TRACE_FN_ARG_PTR(device_list);
+  TRACE_FN_ARG_PTR(lengths);
+  TRACE_FN_ARG_PTR(binaries);
+  TRACE_FN_ARG_PTR(binary_status);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return clCreateProgramWithBinary(context, num_devices, device_list, lengths,
+                                   binaries, binary_status, errcode_ret);
+}
+
+cl_int TRACE_FN(clMemFreeINTEL)(
+    clMemFreeINTEL_fn funcptr,
+    cl_context context,
+    void *ptr) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(ptr);
+  TRACE_FN_ARG_END();
+  return funcptr(context, ptr);
+}
+
+cl_int TRACE_FN(clReleaseCommandQueue)(
+    cl_command_queue command_queue) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_END();
+  return clReleaseCommandQueue(command_queue);
+}
+
+cl_int TRACE_FN(clReleaseContext)(
+    cl_context context) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_END();
+  return clReleaseContext(context);
+}
+
+cl_int TRACE_FN(clReleaseKernel)(
+    cl_kernel kernel) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_END();
+  return clReleaseKernel(kernel);
+}
+
+cl_int TRACE_FN(clReleaseMemObject)(
+    cl_mem memobj) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(memobj);
+  TRACE_FN_ARG_END();
+  return clReleaseMemObject(memobj);
+}
+
+cl_int TRACE_FN(clReleaseProgram)(
+    cl_program program) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_END();
+  return clReleaseProgram(program);
+}
+
+cl_int TRACE_FN(clSetEventCallback)(
+    cl_event event,
+    cl_int command_exec_callback_type,
+    void (CL_CALLBACK *pfn_notify)(cl_event, cl_int, void *),
+    void *user_data) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(event);
+  TRACE_FN_ARG_INT(command_exec_callback_type);
+  TRACE_FN_ARG_PTR(pfn_notify);
+  TRACE_FN_ARG_PTR(user_data);
+  TRACE_FN_ARG_END();
+  return clSetEventCallback(event, command_exec_callback_type, pfn_notify,
+                            user_data);
+}
+
+cl_int TRACE_FN(clSetKernelArg)(
+    cl_kernel kernel,
+    cl_uint arg_index,
+    size_t arg_size,
+    const void *arg_value) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(arg_index);
+  TRACE_FN_ARG_SIZE(arg_size);
+  TRACE_FN_ARG_PTR(arg_value);
+  TRACE_FN_ARG_END();
+  return clSetKernelArg(kernel, arg_index, arg_size, arg_value);
+}
+
+cl_int TRACE_FN(clSetKernelArgSVMPointer)(
+    cl_kernel kernel,
+    cl_uint arg_index,
+    const void *arg_value) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(arg_index);
+  TRACE_FN_ARG_PTR(arg_value);
+  TRACE_FN_ARG_END();
+  return clSetKernelArgSVMPointer(kernel, arg_index, arg_value);
+}
+
+cl_int TRACE_FN(clSetKernelArgMemPointerINTEL)(
+    clSetKernelArgMemPointerINTEL_fn funcptr,
+    cl_kernel kernel,
+    cl_uint arg_index,
+    const void *arg_value) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(arg_index);
+  TRACE_FN_ARG_PTR(arg_value);
+  TRACE_FN_ARG_END();
+  return funcptr(kernel, arg_index, arg_value);
+}
+
+cl_int TRACE_FN(clSetKernelExecInfo)(
+    cl_kernel kernel,
+    cl_kernel_exec_info param_name,
+    size_t param_value_size,
+    const void *param_value) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(param_name);
+  TRACE_FN_ARG_SIZE(param_value_size);
+  TRACE_FN_ARG_PTR(param_value);
+  TRACE_FN_ARG_END();
+  return clSetKernelExecInfo(kernel, param_name, param_value_size, param_value);
+}
+
+void *TRACE_FN(clSharedMemAllocINTEL)(
+    clSharedMemAllocINTEL_fn funcptr,
+    cl_context context,
+    cl_device_id device,
+    const cl_mem_properties_intel *properties,
+    size_t size,
+    cl_uint alignment,
+    cl_int *errcode_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_PTR(properties);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(alignment);
+  TRACE_FN_ARG_PTR(errcode_ret);
+  TRACE_FN_ARG_END();
+  return funcptr(context, device, properties, size, alignment, errcode_ret);
+}
+
+void *TRACE_FN(clSVMAlloc)(
+    cl_context context,
+    cl_svm_mem_flags flags,
+    size_t size,
+    cl_uint alignment) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_ULONG(flags);
+  TRACE_FN_ARG_SIZE(size);
+  TRACE_FN_ARG_UINT(alignment);
+  TRACE_FN_ARG_END();
+  return clSVMAlloc(context, flags, size, alignment);
+}
+
+void TRACE_FN(clSVMFree)(
+    cl_context context,
+    void *svm_pointer) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(context);
+  TRACE_FN_ARG_PTR(svm_pointer);
+  TRACE_FN_ARG_END();
+  clSVMFree(context, svm_pointer);
+}
+
+cl_int TRACE_FN(clWaitForEvents)(
+    cl_uint num_events,
+    const cl_event *event_list) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_UINT(num_events);
+  TRACE_FN_ARG_PTR(event_list);
+  TRACE_FN_ARG_END();
+  return clWaitForEvents(num_events, event_list);
+}
+
+cl_int TRACE_FN(clGetKernelSuggestedLocalWorkSizeINTEL)(
+    clGetKernelSuggestedLocalWorkSizeINTEL_fn funcptr,
+    cl_command_queue command_queue,
+    cl_kernel kernel,
+    cl_uint work_dim,
+    const size_t *global_work_offset,
+    const size_t *global_work_size,
+    size_t *suggested_local_work_size) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(command_queue);
+  TRACE_FN_ARG_PTR(kernel);
+  TRACE_FN_ARG_UINT(work_dim);
+  TRACE_FN_ARG_PTR(global_work_offset);
+  TRACE_FN_ARG_PTR(global_work_size);
+  TRACE_FN_ARG_PTR(suggested_local_work_size);
+  TRACE_FN_ARG_END();
+  return funcptr(command_queue, kernel, work_dim, global_work_offset,
+                 global_work_size, suggested_local_work_size);
+}
+
+cl_int TRACE_FN(clSetProgramSpecializationConstant)(
+    clSetProgramSpecializationConstant_fn funcptr,
+    cl_program program,
+    cl_uint spec_id,
+    size_t spec_size,
+    const void* spec_value) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_UINT(spec_id);
+  TRACE_FN_ARG_SIZE(spec_size);
+  TRACE_FN_ARG_PTR(spec_value);
+  TRACE_FN_ARG_END();
+  return funcptr(program, spec_id, spec_size, spec_value);
+}
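+
+// A tracing wrapper for clGetDeviceGlobalVariablePointerINTEL, following the
+// same pattern as the other extension wrappers above; parameter names are
+// illustrative, the signature matches the typedef near the top of this file.
+cl_int TRACE_FN(clGetDeviceGlobalVariablePointerINTEL)(
+    clGetDeviceGlobalVariablePointerINTEL_fn funcptr,
+    cl_device_id device,
+    cl_program program,
+    const char *global_variable_name,
+    size_t *global_variable_size_ret,
+    void **global_variable_pointer_ret) {
+  TRACE_FN_ARG_BEGIN();
+  TRACE_FN_ARG_PTR(device);
+  TRACE_FN_ARG_PTR(program);
+  TRACE_FN_ARG_PTR(global_variable_name);
+  TRACE_FN_ARG_PTR(global_variable_size_ret);
+  TRACE_FN_ARG_PTR(global_variable_pointer_ret);
+  TRACE_FN_ARG_END();
+  return funcptr(device, program, global_variable_name,
+                 global_variable_size_ret, global_variable_pointer_ret);
+}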
+
+/// Calls without error checking
+#define CALL_CL_SILENT(Rc, Fn, ...)                                            \
+  do {                                                                         \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \
+      Rc = TRACE_FN(Fn)(__VA_ARGS__);                                          \
+    } else {                                                                   \
+      Rc = Fn(__VA_ARGS__);                                                    \
+    }                                                                          \
+  } while (0)
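+
+// Minimal usage sketch (names are illustrative):
+//   cl_int RC;
+//   CALL_CL_SILENT(RC, clFinish, Queue);
+// routes through TRACE_FN(clFinish) when tracing is enabled and calls
+// clFinish(Queue) directly otherwise.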
+
+/// Calls that only have a return code
+#define CALL_CL(Rc, Fn, ...)                                                   \
+  do {                                                                         \
+    CALL_CL_SILENT(Rc, Fn, __VA_ARGS__);                                       \
+    if (Rc != CL_SUCCESS) {                                                    \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc,    \
+         getCLErrorName(Rc));                                                  \
+    }                                                                          \
+  } while (0)
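+
+// As above, but a non-CL_SUCCESS result is also logged, e.g. (illustrative):
+//   CALL_CL(RC, clFinish, Queue); // reports "clFinish failed ..." on error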
+
+/// Emit a warning for an unsuccessful CL call
+#define CALL_CLW(Rc, Fn, ...)                                                  \
+  do {                                                                         \
+    CALL_CL_SILENT(Rc, Fn, __VA_ARGS__);                                       \
+    if (Rc != CL_SUCCESS) {                                                    \
+      DP("Warning: %s:%s returned %d, %s\n", __func__, #Fn, Rc,                \
+         getCLErrorName(Rc));                                                  \
+    }                                                                          \
+  } while (0)
+
+#define CALL_CL_RET(Ret, Fn, ...)                                              \
+  do {                                                                         \
+    cl_int rc;                                                                 \
+    CALL_CL(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != CL_SUCCESS)                                                      \
+      return Ret;                                                              \
+  } while (0)
+
+#define CALL_CLW_RET(Ret, Fn, ...)                                             \
+  do {                                                                         \
+    cl_int rc;                                                                 \
+    CALL_CLW(rc, Fn, __VA_ARGS__);                                             \
+    if (rc != CL_SUCCESS)                                                      \
+      return Ret;                                                              \
+  } while (0)
+
+#define CALL_CL_EXIT_FAIL(Fn, ...)                                             \
+  do {                                                                         \
+    cl_int rc;                                                                 \
+    CALL_CL(rc, Fn, __VA_ARGS__);                                              \
+    if (rc != CL_SUCCESS)                                                      \
+      exit(EXIT_FAILURE);                                                      \
+  } while (0)
+
+#define CALL_CL_RET_FAIL(Fn, ...) CALL_CL_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__)
+#define CALL_CL_RET_NULL(Fn, ...) CALL_CL_RET(nullptr, Fn, __VA_ARGS__)
+#define CALL_CL_RET_ZERO(Fn, ...) CALL_CL_RET(0, Fn, __VA_ARGS__)
+#define CALL_CL_RET_VOID(Fn, ...) CALL_CL_RET(, Fn, __VA_ARGS__)
+#define CALL_CLW_RET_VOID(Fn, ...) CALL_CLW_RET(, Fn, __VA_ARGS__)
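+
+// E.g. inside a function returning int32_t (illustrative):
+//   CALL_CL_RET_FAIL(clFinish, Queue);
+// performs a checked call and returns OFFLOAD_FAIL from the enclosing
+// function when the call does not succeed.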
+
+/// Calls that have a return value and a return code
+#define CALL_CL_RVRC(Rv, Fn, Rc, ...)                                          \
+  do {                                                                         \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \
+      Rv = TRACE_FN(Fn)(__VA_ARGS__, &Rc);                                     \
+    } else {                                                                   \
+      Rv = Fn(__VA_ARGS__, &Rc);                                               \
+    }                                                                          \
+    if (Rc != CL_SUCCESS) {                                                    \
+      DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc,    \
+         getCLErrorName(Rc));                                                  \
+    }                                                                          \
+  } while (0)
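+
+// The error-code lvalue is appended as the trailing &Rc argument, matching
+// the clCreate* convention, e.g. (illustrative):
+//   cl_int RC;
+//   cl_program P;
+//   CALL_CL_RVRC(P, clCreateProgramWithIL, RC, Context, Image, Size);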
+
+/// Calls that only have a return value
+#define CALL_CL_RV(Rv, Fn, ...)                                                \
+  do {                                                                         \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \
+      Rv = TRACE_FN(Fn)(__VA_ARGS__);                                          \
+    } else {                                                                   \
+      Rv = Fn(__VA_ARGS__);                                                    \
+    }                                                                          \
+  } while (0)
+
+/// Calls that don't return anything
+#define CALL_CL_VOID(Fn, ...)                                                  \
+  do {                                                                         \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \
+      TRACE_FN(Fn)(__VA_ARGS__);                                               \
+    } else {                                                                   \
+      Fn(__VA_ARGS__);                                                         \
+    }                                                                          \
+  } while (0)
+
+/// Call an extension function that returns nothing
+#define CALL_CL_EXT_VOID(DeviceId, Name, ...)                                  \
+  do {                                                                         \
+    Name##_fn Fn = reinterpret_cast<Name##_fn>(                                \
+        DeviceInfo->getExtensionFunctionPtr(DeviceId, Name##Id));              \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n",                                             \
+           TO_STRING(Name), TO_STRING(( __VA_ARGS__ )));                       \
+      TRACE_FN(Name)(Fn, __VA_ARGS__);                                         \
+    } else {                                                                   \
+      (*Fn)(__VA_ARGS__);                                                      \
+    }                                                                          \
+  } while (0)
+
+/// Extension calls without error reporting
+#define CALL_CL_EXT_SILENT(DeviceId, Rc, Name, ...)                            \
+  do {                                                                         \
+    Name##_fn Fn = reinterpret_cast<Name##_fn>(                                \
+        DeviceInfo->getExtensionFunctionPtr(DeviceId, Name##Id));              \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n",                                             \
+           TO_STRING(Name), TO_STRING(( __VA_ARGS__ )));                       \
+      Rc = TRACE_FN(Name)(Fn, __VA_ARGS__);                                    \
+    } else {                                                                   \
+      Rc = (*Fn)(__VA_ARGS__);                                                 \
+    }                                                                          \
+  } while (0)
+
+/// Extension calls that only have a return code
+#define CALL_CL_EXT(DeviceId, Rc, Name, ...)                                   \
+  do {                                                                         \
+    CALL_CL_EXT_SILENT(DeviceId, Rc, Name, __VA_ARGS__);                       \
+    if (Rc != CL_SUCCESS) {                                                    \
+      DP("Error: %s:%s failed with error code %d, %s\n",                       \
+         __func__, TO_STRING(Name), Rc, getCLErrorName(Rc));                   \
+    }                                                                          \
+  } while (0)
+
+/// Extension calls that have a return value and a return code
+#define CALL_CL_EXT_RVRC(DeviceId, Rv, Name, Rc, ...)                          \
+  do {                                                                         \
+    Name##_fn Fn = reinterpret_cast<Name##_fn>(                                \
+        DeviceInfo->getExtensionFunctionPtr(DeviceId, Name##Id));              \
+    if (DebugLevel > 1) {                                                      \
+      DPCALL("CL_CALLER: %s %s\n",                                             \
+           TO_STRING(Name), TO_STRING(( __VA_ARGS__ )));                       \
+      Rv = TRACE_FN(Name)(Fn, __VA_ARGS__, &Rc);                               \
+    } else {                                                                   \
+      Rv = (*Fn)(__VA_ARGS__, &Rc);                                            \
+    }                                                                          \
+    if (Rc != CL_SUCCESS) {                                                    \
+      DP("Error: %s:%s failed with error code %d, %s\n",                       \
+         __func__, TO_STRING(Name), Rc, getCLErrorName(Rc));                   \
+    }                                                                          \
+  } while (0)
+
+#define CALL_CL_EXT_RET(DeviceId, Ret, Name, ...)                              \
+  do {                                                                         \
+    cl_int rc;                                                                 \
+    CALL_CL_EXT(DeviceId, rc, Name, __VA_ARGS__);                              \
+    if (rc != CL_SUCCESS)                                                      \
+      return Ret;                                                              \
+  } while (0)
+
+#define CALL_CL_EXT_RET_FAIL(DeviceId, Name, ...)                              \
+  CALL_CL_EXT_RET(DeviceId, OFFLOAD_FAIL, Name, __VA_ARGS__)
+#define CALL_CL_EXT_RET_NULL(DeviceId, Name, ...)                              \
+  CALL_CL_EXT_RET(DeviceId, nullptr, Name, __VA_ARGS__)
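+
+// Illustrative usage sketch (mirrors real call sites later in this file);
+// the argument lists below follow clDeviceMemAllocINTEL and
+// clEnqueueMemcpyINTEL as used by this plugin:
+//
+//   void *Mem = nullptr;
+//   cl_int RC;
+//   // Return value plus error code:
+//   CALL_CL_EXT_RVRC(DeviceId, Mem, clDeviceMemAllocINTEL, RC, Context,
+//                    Device, AllocProp->data(), Size, Align);
+//   // Error code only, returning OFFLOAD_FAIL to the caller on failure:
+//   CALL_CL_EXT_RET_FAIL(DeviceId, clEnqueueMemcpyINTEL, Queue, CL_FALSE,
+//                        DstPtr, SrcPtr, Size, 0, nullptr, &Event);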
+
+#endif // !defined(RTL_TRACE_H)
diff --git a/openmp/libomptarget/plugins/opencl/src/rtl.cpp b/openmp/libomptarget/plugins/opencl/src/rtl.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/opencl/src/rtl.cpp
@@ -0,0 +1,3969 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is modified from https://github.com/daniel-schuermann/openmp.git.
+// Thanks to Daniel Schuermann, the author of rtl.cpp.
+//
+// RTL for SPIR-V/OpenCL machine
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <cassert>
+#include <cctype>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <list>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#ifdef _WIN32
+#include <fcntl.h>
+#include <io.h>
+#include <Windows.h>
+#else
+#include <dlfcn.h>
+#include <unistd.h>
+#endif
+
+#include "elf_light.h"
+#include "omptargetplugin.h"
+#include "rtl-trace.h"
+
+#include "llvm/Support/Endian.h"
+
+/// Additional TARGET_ALLOC* definition for the plugin
+constexpr int32_t TARGET_ALLOC_SVM = INT32_MAX;
+
+/// Device type enumeration common to compiler and runtime
+enum DeviceArch : uint64_t {
+  DeviceArch_None   = 0,
+  DeviceArch_Gen9   = 0x0001,
+  DeviceArch_XeLP   = 0x0002,
+  DeviceArch_XeHP   = 0x0004,
+  DeviceArch_x86_64 = 0x0100
+};
+
+/// Mapping from device arch to GPU runtime's device identifiers
+#ifdef _WIN32
+/// For now, we need to depend on known published product names
+std::map<uint64_t, std::vector<const char *>> DeviceArchStrMap {
+  {
+    DeviceArch_Gen9, {
+      "HD Graphics",
+      "UHD Graphics",
+      "Pro Graphics",
+      "Plus Graphics",
+      "Iris(TM) Graphics",
+    }
+  },
+  {
+    DeviceArch_XeLP, {
+      "Xe Graphics",
+      "Xe MAX Graphics"
+    }
+  }
+  // TODO: how to detect XeHP?
+  // Using XeHP on Windows seems to be a rare case.
+};
+#endif // _WIN32
+std::map<uint64_t, std::vector<uint32_t>> DeviceArchMap {
+  {
+    DeviceArch_Gen9, {
+      0x1900, // SKL
+      0x5900, // KBL
+      0x3E00, 0x9B00, // CFL
+      0x8A00, // ICX
+    }
+  },
+  {
+    DeviceArch_XeLP, {
+      0xFF20, 0x9A00, // TGL
+      0x4900, // DG1
+      0x4C00, // RKL
+      0x4600, // ADLS
+    }
+  },
+};
+
+int DebugLevel = getDebugLevel();
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// FIXME: we should actually include omp.h instead of declaring
+//        these ourselves.
+#if _WIN32
+int __cdecl omp_get_max_teams(void);
+int __cdecl omp_get_thread_limit(void);
+double __cdecl omp_get_wtime(void);
+int __cdecl __kmpc_global_thread_num(void *);
+#else   // !_WIN32
+int omp_get_max_teams(void) __attribute__((weak));
+int omp_get_thread_limit(void) __attribute__((weak));
+double omp_get_wtime(void) __attribute__((weak));
+int __kmpc_global_thread_num(void *) __attribute__((weak));
+#endif  // !_WIN32
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+class KernelInfoTy {
+  uint32_t Version = 0;
+  uint64_t Attributes1 = 0;
+  uint64_t WGNum = 0;
+  uint64_t WINum = 0;
+
+  struct KernelArgInfoTy {
+    bool IsLiteral = false;
+    uint32_t Size = 0;
+    KernelArgInfoTy(bool IsLiteral, uint32_t Size)
+      : IsLiteral(IsLiteral), Size(Size) {}
+  };
+  std::vector<KernelArgInfoTy> ArgsInfo;
+
+  void checkVersion(uint32_t MinVer) const {
+    assert(Version >= MinVer &&
+           "API is not supported for this version of KernelInfoTy.");
+    // Keep both names referenced when asserts are compiled out.
+    (void)Version;
+    (void)MinVer;
+  }
+
+public:
+  KernelInfoTy(uint32_t Version) : Version(Version) {}
+  void addArgInfo(bool IsLiteral, uint32_t Size) {
+    checkVersion(1);
+    ArgsInfo.emplace_back(IsLiteral, Size);
+  }
+  size_t getArgsNum() const {
+    checkVersion(1);
+    return ArgsInfo.size();
+  }
+  bool isArgLiteral(uint32_t Idx) const {
+    checkVersion(1);
+    return ArgsInfo[Idx].IsLiteral;
+  }
+  uint32_t getArgSize(uint32_t Idx) const {
+    checkVersion(1);
+    return ArgsInfo[Idx].Size;
+  }
+  void setAttributes1(uint64_t Val) {
+    Attributes1 = Val;
+  }
+  bool getHasTeamsReduction() const {
+    return (Attributes1 & 1);
+  }
+  void setWGNum(uint64_t Val) {
+    WGNum = Val;
+  }
+  uint64_t getWGNum() const {
+    return WGNum;
+  }
+  void setWINum(uint64_t Val) {
+    WINum = Val;
+  }
+  uint64_t getWINum() const {
+    return WINum;
+  }
+  bool isAtomicFreeReduction() const {
+    return getWGNum() != 0;
+  }
+};
+
+/// Loop descriptor
+typedef struct {
+  int64_t Lb;     // The lower bound of the i-th loop
+  int64_t Ub;     // The upper bound of the i-th loop
+  int64_t Stride; // The stride of the i-th loop
+} TgtLoopDescTy;
+
+typedef struct {
+  int32_t NumLoops;        // Number of loops/dimensions
+  int32_t DistributeDim;   // Dimensions lower than this one
+                           // must end up in one WG
+  TgtLoopDescTy Levels[3]; // Up to 3 loops
+} TgtNDRangeDescTy;
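+
+// Illustrative example (assuming inclusive upper bounds, which the fields
+// above do not spell out): a loop nest collapsed to two dimensions with
+// iteration spaces [0, N) and [0, M) and unit strides could be described as
+//   TgtNDRangeDescTy Desc = {2, 0, {{0, N - 1, 1}, {0, M - 1, 1}, {0, 0, 0}}};
+// DistributeDim == 0 means no lower dimension is constrained to a single WG.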
+
+/// Profile data
+struct ProfileDataTy {
+  struct TimingsTy {
+    double host = 0.0;
+    double device = 0.0;
+  };
+
+  std::map<std::string, TimingsTy> data;
+
+  std::string alignLeft(size_t Width, std::string Str) {
+    if (Str.size() < Width)
+      return Str + std::string(Width - Str.size(), ' ');
+    return Str;
+  }
+
+  void printData(int32_t deviceId, int32_t threadId, const char *deviceName,
+                 int64_t resolution) {
+    std::string profileSep(80, '=');
+    std::string lineSep(80, '-');
+
+    fprintf(stderr, "%s\n", profileSep.c_str());
+
+    fprintf(stderr, "LIBOMPTARGET_PLUGIN_PROFILE(%s) for OMP DEVICE(%" PRId32
+            ") %s, Thread %" PRId32 "\n", GETNAME(TARGET_NAME), deviceId,
+            deviceName, threadId);
+
+    fprintf(stderr, "%s\n", lineSep.c_str());
+
+    const char *unit = resolution == 1000 ? "msec" : "usec";
+
+    std::string kernelPrefix("Kernel ");
+    size_t maxKeyLength = kernelPrefix.size() + 3;
+    for (const auto &d : data)
+      if (d.first.substr(0, kernelPrefix.size()) != kernelPrefix &&
+          maxKeyLength < d.first.size())
+        maxKeyLength = d.first.size();
+
+    // Print kernel key and name
+    int kernelId = 0;
+    for (const auto &d : data) {
+      if (d.first.substr(0, kernelPrefix.size()) == kernelPrefix)
+        fprintf(stderr, "-- %s: %s\n",
+                alignLeft(maxKeyLength, kernelPrefix +
+                          std::to_string(kernelId++)).c_str(),
+                d.first.substr(kernelPrefix.size()).c_str());
+    }
+
+    fprintf(stderr, "%s\n", lineSep.c_str());
+
+    fprintf(stderr, "-- %s:     Host Time (%s)   Device Time (%s)\n",
+            alignLeft(maxKeyLength, "Name").c_str(), unit, unit);
+
+    double hostTotal = 0.0;
+    double deviceTotal = 0.0;
+    kernelId = 0;
+    for (const auto &d : data) {
+      double hostTime = 1e-9 * d.second.host * resolution;
+      double deviceTime = 0.0;
+      std::string key(d.first);
+
+      if (d.first.substr(0, kernelPrefix.size()) == kernelPrefix) {
+        key = kernelPrefix + std::to_string(kernelId++);
+        deviceTime = 1e-9 * d.second.device * resolution;
+      } else if (d.first.substr(0, 8) == "DataRead" ||
+                 d.first.substr(0, 9) == "DataWrite") {
+        deviceTime = 1e-9 * d.second.device * resolution;
+      }
+
+      fprintf(stderr, "-- %s: %20.3f %20.3f\n",
+              alignLeft(maxKeyLength, key).c_str(), hostTime, deviceTime);
+      hostTotal += hostTime;
+      deviceTotal += deviceTime;
+    }
+    fprintf(stderr, "-- %s: %20.3f %20.3f\n",
+            alignLeft(maxKeyLength, "Total").c_str(), hostTotal, deviceTotal);
+    fprintf(stderr, "%s\n", profileSep.c_str());
+  }
+
+  // for non-event profile
+  void update(
+      const char *name, cl_ulong host_elapsed, cl_ulong device_elapsed) {
+    std::string key(name);
+    TimingsTy &timings = data[key];
+    timings.host += host_elapsed;
+    timings.device += device_elapsed;
+  }
+
+  void update(
+      const char *name, double host_elapsed, double device_elapsed) {
+    std::string key(name);
+    TimingsTy &timings = data[key];
+    timings.host += host_elapsed;
+    timings.device += device_elapsed;
+  }
+
+  // for event profile
+  void update(const char *name, cl_event event) {
+    cl_ulong host_begin = 0, host_end = 0;
+    CALL_CLW_RET_VOID(clGetEventProfilingInfo, event,
+        CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &host_begin, nullptr);
+    CALL_CLW_RET_VOID(clGetEventProfilingInfo, event,
+        CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &host_end, nullptr);
+    cl_ulong device_begin = 0, device_end = 0;
+    CALL_CLW_RET_VOID(clGetEventProfilingInfo, event,
+        CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &device_begin, nullptr);
+    CALL_CLW_RET_VOID(clGetEventProfilingInfo, event,
+        CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &device_end, nullptr);
+    update(name, host_end - host_begin, device_end - device_begin);
+  }
+}; // ProfileDataTy
+
+// Platform-dependent information -- context and INTEL extension API
+struct PlatformInfoTy {
+  cl_platform_id Platform = nullptr;
+  cl_context Context = nullptr;
+  std::vector<const char *> ExtensionFunctionNames {
+#define EXTENSION_FN_NAME(Fn) TO_STRING(Fn),
+      FOR_EACH_EXTENSION_FN(EXTENSION_FN_NAME)
+  };
+  std::vector<void *> ExtensionFunctionPointers;
+
+  PlatformInfoTy() = default;
+
+  PlatformInfoTy(cl_platform_id platform, cl_context context) {
+    Platform = platform;
+    Context = context;
+    ExtensionFunctionPointers.resize(ExtensionFunctionNames.size(), nullptr);
+    for (int i = 0; i < ExtensionIdLast; i++) {
+      CALL_CL_RV(ExtensionFunctionPointers[i],
+                 clGetExtensionFunctionAddressForPlatform, platform,
+                 ExtensionFunctionNames[i]);
+      if (ExtensionFunctionPointers[i]) {
+        DP("Extension %s is found.\n", ExtensionFunctionNames[i]);
+      } else {
+        DP("Warning: Extension %s is not found.\n", ExtensionFunctionNames[i]);
+      }
+    }
+  }
+};
+
+// OpenCL extensions status.
+enum ExtensionStatusTy : uint8_t {
+  // Default value.  It is unknown if the extension is supported.
+  ExtensionStatusUnknown = 0,
+
+  // Extension is disabled (either because it is unsupported or
+  // due to user environment control).
+  ExtensionStatusDisabled,
+
+  // Extension is enabled. An extension can only be used
+  // if it has this status after __tgt_rtl_load_binary.
+  ExtensionStatusEnabled,
+};
+
+// A descriptor of OpenCL extensions with their statuses.
+struct ExtensionsTy {
+  ExtensionStatusTy UnifiedSharedMemory = ExtensionStatusUnknown;
+  ExtensionStatusTy DeviceAttributeQuery = ExtensionStatusUnknown;
+  ExtensionStatusTy GetDeviceGlobalVariablePointer = ExtensionStatusUnknown;
+  ExtensionStatusTy SuggestedGroupSize = ExtensionStatusUnknown;
+
+  // Libdevice extensions that may be supported by the device runtime.
+  struct LibdeviceExtDescTy {
+    const char *Name;
+    const char *FallbackLibName;
+    ExtensionStatusTy Status;
+  };
+
+  std::vector<LibdeviceExtDescTy> LibdeviceExtensions = {
+    {
+      "cl_intel_devicelib_cassert",
+      "libomp-fallback-cassert.spv",
+      ExtensionStatusUnknown
+    },
+    {
+      "cl_intel_devicelib_math",
+      "libomp-fallback-cmath.spv",
+      ExtensionStatusUnknown
+    },
+    {
+      "cl_intel_devicelib_math_fp64",
+      "libomp-fallback-cmath-fp64.spv",
+      ExtensionStatusUnknown
+    },
+    {
+      "cl_intel_devicelib_complex",
+      "libomp-fallback-complex.spv",
+      ExtensionStatusUnknown
+    },
+    {
+      "cl_intel_devicelib_complex_fp64",
+      "libomp-fallback-complex-fp64.spv",
+      ExtensionStatusUnknown
+    },
+    {
+      "cl_intel_devicelib_cstring",
+      "libomp-fallback-cstring.spv",
+      ExtensionStatusUnknown
+    },
+  };
+
+  // Initialize extensions' statuses for the given device.
+  int32_t getExtensionsInfoForDevice(int32_t DeviceId);
+};
+
+/// Data transfer method
+enum DataTransferMethodTy {
+  DATA_TRANSFER_METHOD_INVALID = -1,   // Invalid
+  DATA_TRANSFER_METHOD_CLMEM = 0,      // Use Buffer on SVM
+  DATA_TRANSFER_METHOD_SVMMAP,         // Use SVMMap/Unmap
+  DATA_TRANSFER_METHOD_SVMMEMCPY,      // Use SVMMemcpy
+  DATA_TRANSFER_METHOD_LAST,
+};
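+
+// The method is selected via LIBOMPTARGET_OPENCL_DATA_TRANSFER_METHOD
+// (a single digit: 0 = CLMEM, 1 = SVMMAP, 2 = SVMMEMCPY) and only applies
+// when SVM is used; SVMMAP is the default. See RTLOptionTy below.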
+
+/// Abstraction over a target image that can contain multiple OpenCL programs
+class OpenCLProgramTy {
+  struct DeviceOffloadEntryTy {
+    /// Common part with the host offload table.
+    __tgt_offload_entry Base;
+    /// Length of the Base.name string in bytes including
+    /// the null terminator.
+    size_t NameSize;
+  };
+
+  /// Cached device image
+  __tgt_device_image *Image = nullptr;
+
+  /// Cached OpenCL context
+  cl_context Context = nullptr;
+
+  /// Cached OpenCL device
+  cl_device_id Device = nullptr;
+
+  /// Cached OpenMP device ID
+  int32_t DeviceId = 0;
+
+  /// Whether the program was created from a native binary
+  bool IsBinary = false;
+
+  /// Target table
+  __tgt_target_table Table;
+
+  /// Target entries
+  std::vector<__tgt_offload_entry> Entries;
+
+  /// Internal offload entries
+  std::vector<DeviceOffloadEntryTy> OffloadEntries;
+
+  /// Handle multiple modules within a single target image
+  std::vector<cl_program> Programs;
+
+  /// Kernels created from the target image
+  std::vector<cl_kernel> Kernels;
+
+  /// Kernel info added by compiler
+  std::unordered_map<cl_kernel, KernelInfoTy> KernelInfo;
+
+  /// Final OpenCL program
+  cl_program FinalProgram = nullptr;
+
+  /// Requires program link
+  bool RequiresProgramLink = false;
+
+  /// Loads the device version of the offload table for device \p DeviceId.
+  /// The table is expected to have \p NumEntries entries.
+  /// Returns true if the load was successful, false otherwise.
+  bool loadOffloadTable(size_t NumEntries);
+
+  /// Add a single OpenCL program created from the given SPIR-V image
+  int32_t addProgramIL(const size_t Size, const unsigned char *Image);
+
+  /// Add a single OpenCL program created from the given native image
+  int32_t addProgramBIN(const size_t Size, const unsigned char *Image);
+
+  /// Looks up an OpenMP declare target global variable with the given
+  /// \p Name and \p Size in the device environment for the current device.
+  /// The lookup is first done via the device offload table. If it fails,
+  /// then the lookup falls back to non-OpenMP specific lookup on the device.
+  void *getOffloadVarDeviceAddr(const char *Name, size_t Size);
+
+  /// Read KernelInfo auxiliary information for the specified kernel.
+  /// The information is stored in \p KernelInfo.
+  /// The function is called during the binary loading.
+  bool readKernelInfo(const __tgt_offload_entry &KernelEntry);
+
+public:
+  OpenCLProgramTy() = default;
+
+  OpenCLProgramTy(__tgt_device_image *Image_, cl_context Context_,
+                  cl_device_id Device_, int32_t DeviceId_) :
+      Image(Image_), Context(Context_), Device(Device_), DeviceId(DeviceId_) {}
+
+  ~OpenCLProgramTy();
+
+  int32_t buildPrograms(std::string &CompilationOptions,
+                        std::string &LinkingOptions);
+
+  int32_t compilePrograms(std::string &CompilationOptions,
+                          std::string &LinkingOptions);
+
+  int32_t linkPrograms(std::string &LinkingOptions);
+
+  /// Looks up an external global variable with the given \p Name
+  /// in the device environment for device \p DeviceId.
+  /// \p SizePtr must not be null. If (*SizePtr) is not zero, then
+  /// the lookup verifies that the found variable's size matches
+  /// (*SizePtr); otherwise, the found variable's size is returned
+  /// via \p SizePtr.
+  void *getVarDeviceAddr(const char *Name, size_t *SizePtr);
+
+  /// Looks up an external global variable with the given \p Name
+  /// and \p Size in the device environment for device \p DeviceId.
+  void *getVarDeviceAddr(const char *Name, size_t Size);
+
+  /// Build kernels from all modules.
+  int32_t buildKernels();
+
+  /// Return the pointer to the offload table.
+  __tgt_target_table *getTablePtr() { return &Table; }
+
+  /// Returns the auxiliary kernel information for the specified kernel.
+  const KernelInfoTy *getKernelInfo(cl_kernel Kernel) const;
+};
+
+/// RTL flags
+struct RTLFlagsTy {
+  uint64_t CollectDataTransferLatency : 1;
+  uint64_t EnableProfile : 1;
+  uint64_t UseInteropQueueInorderAsync : 1;
+  uint64_t UseInteropQueueInorderSharedSync : 1;
+  uint64_t UseHostMemForUSM : 1;
+  uint64_t UseDriverGroupSizes : 1;
+  uint64_t EnableSimd : 1;
+  uint64_t UseSVM : 1;
+  uint64_t UseBuffer : 1;
+  uint64_t UseSingleContext : 1;
+  uint64_t UseImageOptions : 1;
+  uint64_t ShowBuildLog : 1;
+  uint64_t LinkLibDevice : 1;
+  // Add new flags here
+  uint64_t Reserved : 51;
+  RTLFlagsTy() :
+      CollectDataTransferLatency(0),
+      EnableProfile(0),
+      UseInteropQueueInorderAsync(0),
+      UseInteropQueueInorderSharedSync(0),
+      UseHostMemForUSM(0),
+      UseDriverGroupSizes(0),
+      EnableSimd(0),
+      UseSVM(0),
+      UseBuffer(0),
+      UseSingleContext(0),
+      UseImageOptions(1),
+      ShowBuildLog(0),
+      LinkLibDevice(0),
+      Reserved(0) {}
+};
+
+/// Kernel properties.
+struct KernelPropertiesTy {
+  size_t Width = 0;
+  size_t SIMDWidth = 0;
+  size_t MaxThreadGroupSize = 0;
+  /// Kernel-specific implicit arguments
+  std::set<void *> ImplicitArgs;
+};
+
+/// Specialization constants used for an OpenCL program compilation.
+class SpecConstantsTy {
+  std::vector<uint32_t> ConstantIds;
+  std::vector<size_t> ConstantValueSizes;
+  std::vector<const void *> ConstantValues;
+
+public:
+  SpecConstantsTy() = default;
+  SpecConstantsTy(const SpecConstantsTy &) = delete;
+  SpecConstantsTy(const SpecConstantsTy &&Other)
+    : ConstantIds(std::move(Other.ConstantIds)),
+      ConstantValueSizes(std::move(Other.ConstantValueSizes)),
+      ConstantValues(std::move(Other.ConstantValues)) {}
+
+  ~SpecConstantsTy() {
+    for (auto I : ConstantValues) {
+      const char *ValuePtr = reinterpret_cast<const char *>(I);
+      delete[] ValuePtr;
+    }
+  }
+
+  template <typename T>
+  void addConstant(uint32_t Id, T Val) {
+    const size_t ValSize = sizeof(Val);
+    char *ValuePtr = new char[ValSize];
+    *reinterpret_cast<T *>(ValuePtr) = Val;
+
+    ConstantIds.push_back(Id);
+    ConstantValueSizes.push_back(ValSize);
+    ConstantValues.push_back(reinterpret_cast<void *>(ValuePtr));
+  }
+
+  void setProgramConstants(int32_t DeviceId, cl_program Program) const;
+};
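+
+// Illustrative usage (this mirrors how CommonSpecConstants is populated and
+// applied to a program later in this file):
+//   SpecConstantsTy SC;
+//   SC.addConstant<char>(0xFF747469, 1);
+//   SC.setProgramConstants(DeviceId, Program);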
+
+struct MemAllocInfoTy {
+  /// Base address allocated from compute runtime
+  void *Base = nullptr;
+  /// Allocation size known to users/libomptarget
+  size_t Size = 0;
+  /// TARGET_ALLOC kind
+  int32_t Kind = TARGET_ALLOC_DEFAULT;
+  /// Allocation from pool?
+  bool InPool = false;
+  /// Is implicit argument
+  bool ImplicitArg = false;
+
+  MemAllocInfoTy() = default;
+
+  MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool,
+                 bool _ImplicitArg) :
+      Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool),
+      ImplicitArg(_ImplicitArg) {}
+};
+
+/// Allocation information maintained in RTL
+class MemAllocInfoMapTy {
+  /// Map from allocated pointer to allocation information
+  std::map<void *, MemAllocInfoTy> Map;
+  /// Map from target alloc kind to number of implicit arguments
+  std::map<int32_t, uint32_t> NumImplicitArgs;
+  /// Mutex for guarding the internal data
+  std::mutex Mtx;
+
+public:
+  void add(void *Ptr, void *_Base, size_t _Size, int32_t _Kind,
+           bool _InPool = false, bool _ImplicitArg = false) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    auto Inserted = Map.emplace(
+        Ptr, MemAllocInfoTy{_Base, _Size, _Kind, _InPool, _ImplicitArg});
+    (void)Inserted;
+    if (_ImplicitArg)
+      NumImplicitArgs[_Kind]++;
+  }
+
+  bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    auto AllocInfo = Map.find(Ptr);
+    if (AllocInfo == Map.end())
+      return false;
+    if (AllocInfo->second.ImplicitArg)
+      NumImplicitArgs[AllocInfo->second.Kind]--;
+    if (Removed)
+      *Removed = AllocInfo->second;
+    Map.erase(AllocInfo);
+    return true;
+  }
+
+  const MemAllocInfoTy *find(void *Ptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    auto AllocInfo = Map.find(Ptr);
+    if (AllocInfo == Map.end())
+      return nullptr;
+    else
+      return &AllocInfo->second;
+  }
+
+  /// Return allocation information if Ptr belongs to any allocation range.
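+  /// For example (illustrative numbers): for an entry with key
+  /// Ptr = 0x1010, Base = 0x1000 and Size = 0x100, the covered range is
+  /// [0x1000, 0x1000 + 0x110), so any pointer in [0x1000, 0x1110)
+  /// resolves to this allocation.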
+  const MemAllocInfoTy *search(void *Ptr) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    if (Map.size() == 0)
+      return nullptr;
+    auto I = Map.upper_bound(const_cast<void *>(Ptr));
+    // Key pointer (I->first) may be greater than Ptr, so both I and --I need to
+    // be checked.
+    int J = 0;
+    do {
+      std::advance(I, J);
+      if (I == Map.end())
+        continue;
+      uintptr_t AllocBase = (uintptr_t)I->second.Base;
+      size_t AllocSize = I->second.Size + (uintptr_t)I->first - AllocBase;
+      if (AllocBase <= (uintptr_t)Ptr && (uintptr_t)Ptr < AllocBase + AllocSize)
+        return &I->second;
+    } while (J-- > -1 && I != Map.begin());
+
+    return nullptr;
+  }
+
+  bool contains(const void *Ptr, size_t Size) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    if (Map.size() == 0)
+      return false;
+    auto I = Map.upper_bound(const_cast<void *>(Ptr));
+    if (I == Map.begin())
+      return false;
+    --I;
+    bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr &&
+        (uintptr_t)Ptr + (uintptr_t)Size <=
+        (uintptr_t)I->first + (uintptr_t)I->second.Size;
+    return Ret;
+  }
+
+  /// Add a list of implicit arguments to the output vector.
+  void getImplicitArgs(
+      std::vector<void *> &SVMArgs, std::vector<void *> &USMArgs) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    for (auto &AllocInfo : Map) {
+      if (AllocInfo.second.ImplicitArg) {
+        if (AllocInfo.second.Kind == TARGET_ALLOC_SVM)
+          SVMArgs.push_back(AllocInfo.first);
+        else
+          USMArgs.push_back(AllocInfo.first);
+      }
+    }
+  }
+
+  bool hasImplicitUSMArg(int32_t Kind = TARGET_ALLOC_DEFAULT) {
+    std::lock_guard<std::mutex> Lock(Mtx);
+    if (Kind == TARGET_ALLOC_DEFAULT) {
+      uint32_t Num = NumImplicitArgs[TARGET_ALLOC_HOST] +
+          NumImplicitArgs[TARGET_ALLOC_DEVICE] +
+          NumImplicitArgs[TARGET_ALLOC_SHARED];
+      return Num > 0;
+    } else {
+      return NumImplicitArgs[Kind] > 0;
+    }
+  }
+};
+
+/// RTL options and flags users can override
+struct RTLOptionTy {
+  /// Binary flags
+  RTLFlagsTy Flags;
+
+  /// Emulated data transfer latency in microseconds
+  int32_t DataTransferLatency = 0;
+
+  /// Data transfer method when SVM is used
+  int32_t DataTransferMethod = DATA_TRANSFER_METHOD_SVMMAP;
+
+  /// Plugin profiling resolution (msec by default)
+  int64_t ProfileResolution = 1000;
+
+  /// Used device type
+  cl_device_type DeviceType = CL_DEVICE_TYPE_GPU;
+
+  // OpenCL 2.0 builtins (such as atomic_load_explicit) are used by the
+  // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation
+  // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0
+  // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2
+  // builtins.
+  std::string CompilationOptions = "-cl-std=CL2.0 ";
+  std::string UserCompilationOptions = "";
+  std::string UserLinkingOptions = "";
+
+  /// Limit for the number of WIs in a WG.
+  uint32_t ThreadLimit = 0;
+
+  /// Limit for the number of WGs.
+  uint32_t NumTeams = 0;
+
+  // This is a factor applied to the number of WGs computed
+  // for the execution, based on the HW characteristics.
+  size_t SubscriptionRate = 1;
+
+  // For kernels that compute cross-WG reductions the number of computed WGs
+  // is reduced by this factor.
+  size_t ReductionSubscriptionRate = 1;
+  bool ReductionSubscriptionRateIsDefault = true;
+
+  /// Loop kernels with a known ND-range may have too few iterations
+  /// to exploit the offload device to the fullest extent.
+  /// Assume a device has N total HW threads available,
+  /// and the kernel requires M hardware threads with LWS set to L.
+  /// If (M < N * ThinThreadsThreshold), then we will try
+  /// to iteratively divide L by 2 to increase the number of HW
+  /// threads used for executing the kernel. Effectively, we will
+  /// end up with L less than the kernel's SIMD width, so the HW
+  /// threads will not use all their SIMD lanes. This (presumably) should
+  /// allow more parallelism, because the stalls in the SIMD lanes
+  /// will be distributed across more HW threads, and the probability
+  /// of having a stall (or a sequence of stalls) on a critical path
+  /// in the kernel should decrease.
+  /// Anyway, this is just a heuristic that seems to work well for some
+  /// kernels (which poorly expose parallelism in the first place).
+
+  // Spec constants used for all OpenCL programs.
+  SpecConstantsTy CommonSpecConstants;
+
+  RTLOptionTy() {
+    const char *Env;
+
+    // Get global OMP_THREAD_LIMIT for SPMD parallelization.
+    int ThrLimit = omp_get_thread_limit();
+    DP("omp_get_thread_limit() returned %" PRId32 "\n", ThrLimit);
+    // omp_get_thread_limit() would return INT_MAX by default.
+    // NOTE: Windows.h defines max() macro, so we have to guard
+    //       the call with parentheses.
+    ThreadLimit = (ThrLimit > 0 &&
+        ThrLimit != (std::numeric_limits<int32_t>::max)()) ? ThrLimit : 0;
+
+    // Global max number of teams.
+    int NTeams = omp_get_max_teams();
+    DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams);
+    // omp_get_max_teams() would return INT_MAX by default.
+    // NOTE: Windows.h defines max() macro, so we have to guard
+    //       the call with parentheses.
+    NumTeams = (NTeams > 0 &&
+        NTeams != (std::numeric_limits<int32_t>::max)()) ? NTeams : 0;
+
+    // Read LIBOMPTARGET_DATA_TRANSFER_LATENCY (experimental input)
+    if ((Env = readEnvVar("LIBOMPTARGET_DATA_TRANSFER_LATENCY"))) {
+      std::string Value(Env);
+      if (Value.substr(0, 2) == "T,") {
+        Flags.CollectDataTransferLatency = 1;
+        int32_t Usec = std::stoi(Value.substr(2));
+        DataTransferLatency = (Usec > 0) ? Usec : 0;
+      }
+    }
+
+    // Read LIBOMPTARGET_OPENCL_DATA_TRANSFER_METHOD
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_DATA_TRANSFER_METHOD"))) {
+      std::string Value(Env);
+      DataTransferMethod = DATA_TRANSFER_METHOD_INVALID;
+      if (Value.size() == 1 && std::isdigit(Value.c_str()[0])) {
+        int Method = std::stoi(Env);
+        if (Method < DATA_TRANSFER_METHOD_LAST)
+          DataTransferMethod = Method;
+      }
+      if (DataTransferMethod == DATA_TRANSFER_METHOD_INVALID) {
+        WARNING("Invalid data transfer method (%s) selected"
+                " -- using default method.\n", Env);
+        DataTransferMethod = DATA_TRANSFER_METHOD_SVMMAP;
+      }
+    }
+
+    // Read LIBOMPTARGET_DEVICETYPE
+    if ((Env = readEnvVar("LIBOMPTARGET_DEVICETYPE"))) {
+      std::string Value(Env);
+      if (Value == "GPU" || Value == "gpu" || Value == "")
+        DeviceType = CL_DEVICE_TYPE_GPU;
+      else if (Value == "CPU" || Value == "cpu")
+        DeviceType = CL_DEVICE_TYPE_CPU;
+      else
+        WARNING("Invalid or unsupported LIBOMPTARGET_DEVICETYPE=%s\n", Env);
+    }
+    DP("Target device type is set to %s\n",
+       (DeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU");
+
+    /// Oversubscription rate for normal kernels
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_SUBSCRIPTION_RATE"))) {
+      int32_t Value = std::stoi(Env);
+      // Set some reasonable limits.
+      if (Value > 0 && Value <= 0xFFFF)
+        SubscriptionRate = Value;
+    }
+
+    /// Oversubscription rate for reduction kernels
+    if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_REDUCTION_SUBSCRIPTION_RATE"))) {
+      int32_t Value = std::stoi(Env);
+      // Set some reasonable limits.
+      // '0' is a special value meaning to use regular default ND-range
+      // for kernels with reductions.
+      if (Value >= 0 && Value <= 0xFFFF) {
+        ReductionSubscriptionRate = Value;
+        ReductionSubscriptionRateIsDefault = false;
+      }
+    }
+
+    /// Read LIBOMPTARGET_PLUGIN_PROFILE
+    if ((Env = readEnvVar("LIBOMPTARGET_PLUGIN_PROFILE"))) {
+      std::istringstream Value(Env);
+      std::string Token;
+      while (std::getline(Value, Token, ',')) {
+        if (Token == "T" || Token == "1")
+          Flags.EnableProfile = 1;
+        else if (Token == "unit_usec" || Token == "usec")
+          ProfileResolution = 1000000;
+      }
+    }
+
+    if ((Env = readEnvVar("LIBOMPTARGET_ENABLE_SIMD"))) {
+      if (parseBool(Env) == 1)
+        Flags.EnableSimd = 1;
+      else
+        WARNING("Invalid or unsupported LIBOMPTARGET_ENABLE_SIMD=%s\n", Env);
+    }
+
+    // TODO: deprecate this variable since the default behavior is equivalent
+    //       to "inorder_async" and "inorder_shared_sync".
+    // Read LIBOMPTARGET_OPENCL_INTEROP_QUEUE
+    // Two independent options can be specified as follows.
+    // -- inorder_async: use a new in-order queue for asynchronous case
+    //    (default: shared out-of-order queue)
+    // -- inorder_shared_sync: use the existing shared in-order queue for
+    //    synchronous case (default: new in-order queue).
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_INTEROP_QUEUE",
+                          "LIBOMPTARGET_INTEROP_PIPE"))) {
+      std::istringstream Value(Env);
+      std::string Token;
+      while (std::getline(Value, Token, ',')) {
+        if (Token == "inorder_async") {
+          Flags.UseInteropQueueInorderAsync = 1;
+          DP("    enabled in-order asynchronous separate queue\n");
+        } else if (Token == "inorder_shared_sync") {
+          Flags.UseInteropQueueInorderSharedSync = 1;
+          DP("    enabled in-order synchronous shared queue\n");
+        }
+      }
+    }
+
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_COMPILATION_OPTIONS"))) {
+      UserCompilationOptions += Env;
+    }
+
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_LINKING_OPTIONS"))) {
+      UserLinkingOptions += Env;
+    }
+
+    // Read LIBOMPTARGET_USM_HOST_MEM
+    if ((Env = readEnvVar("LIBOMPTARGET_USM_HOST_MEM"))) {
+      if (parseBool(Env) == 1)
+        Flags.UseHostMemForUSM = 1;
+    }
+
+    // Read LIBOMPTARGET_OPENCL_USE_SVM
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_USE_SVM"))) {
+      int32_t Value = parseBool(Env);
+      if (Value == 1)
+        Flags.UseSVM = 1;
+      else if (Value == 0)
+        Flags.UseSVM = 0;
+    }
+
+    // Read LIBOMPTARGET_OPENCL_USE_BUFFER
+    if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_USE_BUFFER"))) {
+      if (parseBool(Env) == 1)
+        Flags.UseBuffer = 1;
+    }
+
+    // Read LIBOMPTARGET_USE_SINGLE_CONTEXT
+    if ((Env = readEnvVar("LIBOMPTARGET_USE_SINGLE_CONTEXT"))) {
+      if (parseBool(Env) == 1)
+        Flags.UseSingleContext = 1;
+    }
+
+    if (readEnvVar("INTEL_ENABLE_OFFLOAD_ANNOTATIONS")) {
+      // To match SYCL RT behavior, we just need to check whether
+      // INTEL_ENABLE_OFFLOAD_ANNOTATIONS is set. The actual value
+      // does not matter.
+      CommonSpecConstants.addConstant<char>(0xFF747469, 1);
+    }
+
+    if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_USE_IMAGE_OPTIONS"))) {
+      int32_t Value = parseBool(Env);
+      if (Value == 1)
+        Flags.UseImageOptions = 1;
+      else if (Value == 0)
+        Flags.UseImageOptions = 0;
+    }
+
+    if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_SHOW_BUILD_LOG"))) {
+      int32_t Value = parseBool(Env);
+      if (Value == 1)
+        Flags.ShowBuildLog = 1;
+      else if (Value == 0)
+        Flags.ShowBuildLog = 0;
+    }
+
+    // LIBOMPTARGET_ONEAPI_LINK_LIBDEVICE
+    if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_LINK_LIBDEVICE"))) {
+      int32_t Value = parseBool(Env);
+      if (Value == 1)
+        Flags.LinkLibDevice = 1;
+      else if (Value == 0)
+        Flags.LinkLibDevice = 0;
+    }
+
+    if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_THIN_THREADS_THRESHOLD"))) {
+      char *StrEnd;
+      errno = 0; // Must be cleared before strtod() for the check below.
+      double Value = std::strtod(Env, &StrEnd);
+      if (errno == 0 && StrEnd != Env &&
+          Value >= 0.0 && Value <= 1.0) {
+        ThinThreadsThreshold = Value;
+      } else {
+        if (errno != 0)
+          DP("Error parsing value of "
+             "LIBOMPTARGET_ONEAPI_THIN_THREADS_THRESHOLD: %s\n",
+             strerror(errno));
+        DP("Value of LIBOMPTARGET_ONEAPI_THIN_THREADS_THRESHOLD must "
+           "be a non-negative floating-point number not greater than 1.0.\n");
+        DP("Using default value: %f\n", ThinThreadsThreshold);
+      }
+    }
+  }
+
+  /// Read environment variable value with optional deprecated name
+  const char *readEnvVar(const char *Name, const char *OldName = nullptr) {
+    if (!Name)
+      return nullptr;
+    const char *Value = std::getenv(Name);
+    if (Value || !OldName) {
+      if (Value)
+        DP("ENV: %s=%s\n", Name, Value);
+      return Value;
+    }
+    Value = std::getenv(OldName);
+    if (Value) {
+      DP("ENV: %s=%s\n", OldName, Value);
+      WARNING("%s is being deprecated. Use %s instead.\n", OldName, Name);
+    }
+    return Value;
+  }
+
+  /// Parse boolean value
+  /// Return 1 for: TRUE, T, 1, ON, YES, ENABLED (case insensitive)
+  /// Return 0 for: FALSE, F, 0, OFF, NO, DISABLED (case insensitive)
+  /// Return -1 for failed match
+  /// NOTE: we can later simplify the document to just TRUE or FALSE like what
+  /// OpenMP host runtime does.
+  int32_t parseBool(const char *Value) {
+    std::string Str(Value);
+    std::transform(Str.begin(), Str.end(), Str.begin(),
+                   [](unsigned char C) {return std::tolower(C);});
+    if (Str == "true" || Str == "t" || Str == "1" || Str == "on" ||
+        Str == "yes" || Str == "enabled")
+      return 1;
+    if (Str == "false" || Str == "f" || Str == "0" || Str == "off" ||
+        Str == "no" || Str == "disabled")
+      return 0;
+    return -1;
+  }
+}; // RTLOptionTy
+
+/// Device property
+struct DevicePropertiesTy {
+  cl_uint DeviceId = 0;
+  cl_uint NumSlices = 0;
+  cl_uint NumSubslicesPerSlice = 0;
+  cl_uint NumEUsPerSubslice = 0;
+  cl_uint NumThreadsPerEU = 0;
+  cl_uint NumHWThreads = 0;
+
+  int32_t getDeviceProperties(cl_device_id ID);
+};
+
+/// Class containing all the device information.
+class RTLDeviceInfoTy {
+public:
+  /// Number of OpenMP devices
+  cl_uint NumDevices = 0;
+
+  /// List of OpenCLProgramTy objects
+  std::vector<std::list<OpenCLProgramTy>> Programs;
+
+  /// Contains context and extension API
+  std::map<cl_platform_id, PlatformInfoTy> PlatformInfos;
+
+  /// Platform that each device belongs to
+  std::vector<cl_platform_id> Platforms;
+
+  /// Contexts used by each device
+  std::vector<cl_context> Contexts;
+
+  /// OpenCL device
+  std::vector<cl_device_id> Devices;
+
+  // Internal device type ID
+  std::vector<uint64_t> DeviceArchs;
+
+  /// Device properties
+  std::vector<int32_t> maxExecutionUnits;
+  std::vector<size_t> maxWorkGroupSize;
+  std::vector<cl_ulong> MaxMemAllocSize;
+  std::vector<DevicePropertiesTy> DeviceProperties;
+
+  /// A vector of descriptors of OpenCL extensions for each device.
+  std::vector<ExtensionsTy> Extensions;
+
+  /// Default command queue for each device
+  std::vector<cl_command_queue> Queues;
+
+  /// In-order command queue for each device
+  std::vector<cl_command_queue> QueuesInOrder;
+
+  /// Kernel properties for each device
+  std::vector<std::map<cl_kernel, KernelPropertiesTy>> KernelProperties;
+
+  /// Kernel-specific implicit arguments
+  std::vector<std::map<cl_kernel, std::set<void *>>> ImplicitArgs;
+
+  /// Thread-private profile information for each device
+  std::vector<std::map<int32_t, ProfileDataTy>> Profiles;
+
+  std::vector<std::vector<char>> Names;
+
+  /// Whether each device is initialized
+  std::vector<bool> Initialized;
+
+  std::vector<cl_ulong> SLMSize;
+
+  std::mutex *Mutexes;
+
+  std::mutex *ProfileLocks;
+
+  std::vector<std::set<void *>> ClMemBuffers;
+
+  /// Memory owned by the plugin
+  std::vector<std::vector<void *>> OwnedMemory;
+
+  /// Internal allocation information
+  std::vector<std::unique_ptr<MemAllocInfoMapTy>> MemAllocInfo;
+
+  /// Requires flags
+  int64_t RequiresFlags = OMP_REQ_UNDEFINED;
+
+  /// Number of active kernel launches for each device
+  std::vector<uint32_t> NumActiveKernels;
+
+  /// RTL option
+  RTLOptionTy Option;
+
+  RTLDeviceInfoTy() = default;
+
+  /// Return per-thread profile data
+  ProfileDataTy &getProfiles(int32_t DeviceId) {
+    int32_t gtid = __kmpc_global_thread_num(nullptr);
+    // lock_guard keeps the mutex exception-safe if emplace throws.
+    std::lock_guard<std::mutex> Lock(ProfileLocks[DeviceId]);
+    auto &profiles = Profiles[DeviceId];
+    if (profiles.count(gtid) == 0)
+      profiles.emplace(gtid, ProfileDataTy());
+    return profiles[gtid];
+  }
+
+  /// Return context for the given device ID
+  cl_context getContext(int32_t DeviceId) {
+    if (Option.Flags.UseSingleContext)
+      return PlatformInfos[Platforms[DeviceId]].Context;
+    else
+      return Contexts[DeviceId];
+  }
+
+  /// Return the extension function pointer for the given ID
+  void *getExtensionFunctionPtr(int32_t DeviceId, int32_t ExtensionId) {
+    auto platformId = Platforms[DeviceId];
+    return PlatformInfos[platformId].ExtensionFunctionPointers[ExtensionId];
+  }
+
+  /// Return the extension function name for the given ID
+  const char *getExtensionFunctionName(int32_t DeviceId, int32_t ExtensionId) {
+    auto platformId = Platforms[DeviceId];
+    return PlatformInfos[platformId].ExtensionFunctionNames[ExtensionId];
+  }
+
+  /// Check if extension function is available and enabled.
+  bool isExtensionFunctionEnabled(int32_t DeviceId, int32_t ExtensionId) {
+    if (!getExtensionFunctionPtr(DeviceId, ExtensionId))
+      return false;
+
+    switch (ExtensionId) {
+    case clGetMemAllocInfoINTELId:
+    case clHostMemAllocINTELId:
+    case clDeviceMemAllocINTELId:
+    case clSharedMemAllocINTELId:
+    case clMemFreeINTELId:
+    case clSetKernelArgMemPointerINTELId:
+    case clEnqueueMemcpyINTELId:
+      return Extensions[DeviceId].UnifiedSharedMemory == ExtensionStatusEnabled;
+    case clGetDeviceGlobalVariablePointerINTELId:
+      return Extensions[DeviceId].GetDeviceGlobalVariablePointer ==
+          ExtensionStatusEnabled;
+    case clGetKernelSuggestedLocalWorkSizeINTELId:
+      return Extensions[DeviceId].SuggestedGroupSize == ExtensionStatusEnabled;
+    default:
+      return true;
+    }
+  }
+
+  /// Reset program data
+  int32_t resetProgramData(int32_t DeviceId);
+
+  /// Allocate cl_mem data
+  void *allocDataClMem(int32_t DeviceId, size_t Size);
+
+  /// Get PCI device ID
+  uint32_t getPCIDeviceId(int32_t DeviceId);
+
+  /// Get device arch
+  uint64_t getDeviceArch(int32_t DeviceId);
+
+  /// Get allocated memory type
+  cl_unified_shared_memory_type_intel getMemAllocType(
+      int32_t DeviceId, const void *Ptr);
+
+  /// For the given kernel return its KernelInfo auxiliary information
+  /// that was previously read by readKernelInfo().
+  const KernelInfoTy *
+      getKernelInfo(int32_t DeviceId, const cl_kernel &Kernel) const;
+
+  /// Get memory allocation properties to be used in memory allocation
+  std::unique_ptr<std::vector<cl_mem_properties_intel>>
+  getAllocMemProperties(int32_t DeviceId, size_t Size);
+};
+
+#ifdef _WIN32
+#define __ATTRIBUTE__(X)
+#else
+#define __ATTRIBUTE__(X)  __attribute__((X))
+#endif // _WIN32
+
+static RTLDeviceInfoTy *DeviceInfo = nullptr;
+
+__ATTRIBUTE__(constructor(101)) void init() {
+  DP("Init OpenCL plugin!\n");
+  DeviceInfo = new RTLDeviceInfoTy();
+}
+
+__ATTRIBUTE__(destructor(101)) void deinit() {
+  DP("Deinit OpenCL plugin!\n");
+  delete DeviceInfo;
+  DeviceInfo = nullptr;
+}
+
+#if _WIN32
+extern "C" BOOL WINAPI
+DllMain(HINSTANCE const instance, // handle to DLL module
+        DWORD const reason,       // reason for calling function
+        LPVOID const reserved)    // reserved
+{
+  // Perform actions based on the reason for calling.
+  switch (reason) {
+  case DLL_PROCESS_ATTACH:
+    // Initialize once for each new process.
+    // Return FALSE to fail DLL load.
+    init();
+    break;
+
+  case DLL_THREAD_ATTACH:
+    // Do thread-specific initialization.
+    break;
+
+  case DLL_THREAD_DETACH:
+    // Do thread-specific cleanup.
+    break;
+
+  case DLL_PROCESS_DETACH:
+    break;
+  }
+  return TRUE; // Successful DLL_PROCESS_ATTACH.
+}
+#endif // _WIN32
+
+// Helper class to collect time intervals for host and device.
+// The interval is managed by start()/stop() methods.
+// The automatic flush of the time interval happens at the object's
+// destruction.
+class ProfileIntervalTy {
+  // A timer interval may be either disabled, paused or running.
+  // An interval may switch from paused to running and from running
+  // to paused. It may switch to disabled from any state, but once
+  // disabled it cannot switch to any other state.
+  enum TimerStatusTy {
+    Disabled,
+    Paused,
+    Running
+  };
+
+  // Cumulative times collected by this interval so far.
+  double DeviceElapsed = 0.0;
+  double HostElapsed = 0.0;
+
+  // Temporary timer values initialized at each interval
+  // (re)start.
+  cl_ulong DeviceTimeTemp = 0;
+  cl_ulong HostTimeTemp = 0;
+
+  // The interval name as seen in the profile data output.
+  std::string Name;
+
+  // OpenMP device id.
+  int32_t DeviceId;
+
+  // OpenCL device id.
+  cl_device_id ClDeviceId;
+
+  // Current status of the interval.
+  TimerStatusTy Status;
+
+public:
+  // Create new timer interval for the given OpenMP device
+  // and with the given name (which will be used for the profile
+  // data output).
+  ProfileIntervalTy(const char *Name, int32_t DeviceId)
+    : Name(Name), DeviceId(DeviceId),
+      ClDeviceId(DeviceInfo->Devices[DeviceId]) {
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      // Start the interval paused.
+      Status = TimerStatusTy::Paused;
+    else
+      // Disable the interval for good.
+      Status = TimerStatusTy::Disabled;
+  }
+
+  // The destructor automatically updates the profile data.
+  ~ProfileIntervalTy() {
+    if (Status == TimerStatusTy::Disabled)
+      return;
+    if (Status == TimerStatusTy::Running) {
+      Status = TimerStatusTy::Disabled;
+      WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s "
+              "is disabled due to start/stop mismatch.\n",
+              Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data());
+      return;
+    }
+
+    DeviceInfo->getProfiles(DeviceId).update(
+        Name.c_str(), HostElapsed, DeviceElapsed);
+  }
+
+  // Trigger interval start.
+  void start() {
+    if (Status == TimerStatusTy::Disabled)
+      return;
+    if (Status == TimerStatusTy::Running) {
+      Status = TimerStatusTy::Disabled;
+      WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s "
+              "is disabled due to start/stop mismatch.\n",
+              Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data());
+      return;
+    }
+    cl_int rc;
+    CALL_CL(rc, clGetDeviceAndHostTimer, ClDeviceId, &DeviceTimeTemp,
+            &HostTimeTemp);
+    if (rc != CL_SUCCESS) {
+      Status = TimerStatusTy::Disabled;
+      WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s "
+              "is disabled due to invalid OpenCL timer.\n",
+              Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data());
+      return;
+    }
+    Status = TimerStatusTy::Running;
+  }
+
+  // Trigger interval stop (actually, a pause).
+  void stop() {
+    if (Status == TimerStatusTy::Disabled)
+      return;
+    if (Status == TimerStatusTy::Paused) {
+      Status = TimerStatusTy::Disabled;
+      WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s "
+              "is disabled due to start/stop mismatch.\n",
+              Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data());
+      return;
+    }
+
+    cl_ulong DeviceTime, HostTime;
+    cl_int rc;
+    CALL_CL(rc, clGetDeviceAndHostTimer, ClDeviceId, &DeviceTime, &HostTime);
+    if (rc != CL_SUCCESS) {
+      Status = TimerStatusTy::Disabled;
+      WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s "
+              "is disabled due to invalid OpenCL timer.\n",
+              Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data());
+      return;
+    }
+
+    if (DeviceTime < DeviceTimeTemp || HostTime < HostTimeTemp) {
+      Status = TimerStatusTy::Disabled;
+      WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s "
+              "is disabled due to timer overflow.\n",
+              Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data());
+      return;
+    }
+
+    DeviceElapsed +=
+        static_cast<double>(DeviceTime) - static_cast<double>(DeviceTimeTemp);
+    HostElapsed +=
+        static_cast<double>(HostTime) - static_cast<double>(HostTimeTemp);
+    Status = TimerStatusTy::Paused;
+  }
+}; // ProfileIntervalTy
+
+/// Clean-up routine to be registered by std::atexit().
+static void closeRTL() {
+  for (uint32_t i = 0; i < DeviceInfo->NumDevices; i++) {
+    if (!DeviceInfo->Initialized[i])
+      continue;
+    if (DeviceInfo->Option.Flags.EnableProfile) {
+      for (auto &profile : DeviceInfo->Profiles[i])
+        profile.second.printData(i, profile.first, DeviceInfo->Names[i].data(),
+                                 DeviceInfo->Option.ProfileResolution);
+    }
+
+    CALL_CL_EXIT_FAIL(clReleaseCommandQueue, DeviceInfo->Queues[i]);
+
+    if (DeviceInfo->QueuesInOrder[i])
+      CALL_CL_EXIT_FAIL(clReleaseCommandQueue, DeviceInfo->QueuesInOrder[i]);
+
+    for (auto mem : DeviceInfo->OwnedMemory[i])
+      CALL_CL_EXT_VOID(i, clMemFreeINTEL, DeviceInfo->getContext(i), mem);
+
+    if (!DeviceInfo->Option.Flags.UseSingleContext)
+      CALL_CL_EXIT_FAIL(clReleaseContext, DeviceInfo->Contexts[i]);
+
+    DeviceInfo->Programs[i].clear();
+  }
+
+  if (DeviceInfo->Option.Flags.UseSingleContext)
+    for (auto &platformInfo : DeviceInfo->PlatformInfos)
+      CALL_CL_EXIT_FAIL(clReleaseContext, platformInfo.second.Context);
+
+  delete[] DeviceInfo->Mutexes;
+  delete[] DeviceInfo->ProfileLocks;
+  DP("Closed RTL successfully\n");
+}
+
+static std::string getDeviceRTLPath(const char *BaseName) {
+  std::string RTLPath;
+#ifdef _WIN32
+  char Path[_MAX_PATH];
+  HMODULE Module = nullptr;
+  if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                          GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                          (LPCSTR) &DeviceInfo, &Module))
+    return RTLPath;
+  if (!GetModuleFileNameA(Module, Path, sizeof(Path)))
+    return RTLPath;
+  RTLPath = Path;
+#else
+  Dl_info RTLInfo;
+  if (!dladdr(&DeviceInfo, &RTLInfo))
+    return RTLPath;
+  RTLPath = RTLInfo.dli_fname;
+#endif
+  size_t Split = RTLPath.find_last_of("/\\");
+  RTLPath.replace(Split + 1, std::string::npos, BaseName);
+  return RTLPath;
+}
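+
+// For example (hypothetical path): if this plugin was loaded from
+// /opt/omp/lib/libomptarget.rtl.opencl.so, then
+// getDeviceRTLPath("libomp-fallback-cassert.spv") returns
+// "/opt/omp/lib/libomp-fallback-cassert.spv".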
+
+static inline void addDataTransferLatency() {
+  if (!DeviceInfo->Option.Flags.CollectDataTransferLatency)
+    return;
+  double goal = omp_get_wtime() + 1e-6 * DeviceInfo->Option.DataTransferLatency;
+  // Naive spinning should be enough
+  while (omp_get_wtime() < goal)
+    ;
+}
+
+// FIXME: move this to llvm/BinaryFormat/ELF.h and elf.h:
+#define NT_INTEL_ONEOMP_OFFLOAD_VERSION 1
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT 2
+#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX 3
+
+static bool isValidOneOmpImage(__tgt_device_image *Image,
+                               uint64_t &MajorVer,
+                               uint64_t &MinorVer) {
+  char *ImgBegin = reinterpret_cast<char *>(Image->ImageStart);
+  char *ImgEnd = reinterpret_cast<char *>(Image->ImageEnd);
+  size_t ImgSize = ImgEnd - ImgBegin;
+  ElfL E(ImgBegin, ImgSize);
+  if (!E.isValidElf()) {
+    DP("Warning: unable to get ELF handle: %s!\n", E.getErrmsg(-1));
+    return false;
+  }
+
+  for (auto I = E.section_notes_begin(), IE = E.section_notes_end(); I != IE;
+       ++I) {
+    ElfLNote Note = *I;
+    if (Note.getNameSize() == 0)
+      continue;
+    std::string NameStr(Note.getName(), Note.getNameSize());
+    if (NameStr != "INTELONEOMPOFFLOAD")
+      continue;
+    uint64_t Type = Note.getType();
+    if (Type != NT_INTEL_ONEOMP_OFFLOAD_VERSION)
+      continue;
+    std::string DescStr(reinterpret_cast<const char *>(Note.getDesc()),
+                        Note.getDescSize());
+    auto DelimPos = DescStr.find('.');
+    if (DelimPos == std::string::npos) {
+      // The version has to look like "Major#.Minor#".
+      DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n", DescStr.c_str());
+      return false;
+    }
+    std::string MajorVerStr = DescStr.substr(0, DelimPos);
+    DescStr.erase(0, DelimPos + 1);
+    MajorVer = std::stoull(MajorVerStr);
+    MinorVer = std::stoull(DescStr);
+    bool isSupported = (MajorVer == 1 && MinorVer == 0);
+    return isSupported;
+  }
+
+  return false;
+}
+
+static void dumpImageToFile(
+    const void *Image, size_t ImageSize, const char *Type) {
+  // No-op placeholder: image dumping support is not wired up in this
+  // version of the plugin.
+  (void)Image;
+  (void)ImageSize;
+  (void)Type;
+}
+
+static void debugPrintBuildLog(cl_program program, cl_device_id did) {
+  if (DebugLevel <= 0 && !DeviceInfo->Option.Flags.ShowBuildLog)
+    return;
+
+  size_t len = 0;
+  CALL_CL_RET_VOID(clGetProgramBuildInfo, program, did, CL_PROGRAM_BUILD_LOG, 0,
+                   nullptr, &len);
+  // len should always be greater than 0 here, because the log string is
+  // null-terminated.
+  if (len == 0)
+    return;
+
+  std::vector<char> buffer(len);
+  CALL_CL_RET_VOID(clGetProgramBuildInfo, program, did, CL_PROGRAM_BUILD_LOG,
+                   len, buffer.data(), nullptr);
+  const char *buildLog = (len > 1) ? buffer.data() : "<empty>";
+  MESSAGE0("Target build log:");
+  std::stringstream Str(buildLog);
+  std::string Line;
+  while (std::getline(Str, Line, '\n'))
+    MESSAGE("  %s", Line.c_str());
+}
+
+static cl_program createProgramFromFile(const char *BaseName,
+                                        int32_t DeviceId) {
+  std::string RTLPath = getDeviceRTLPath(BaseName);
+  std::ifstream RTLFile(RTLPath, std::ios::binary);
+
+  if (RTLFile.is_open()) {
+    DP("Found device RTL: %s\n", RTLPath.c_str());
+    RTLFile.seekg(0, RTLFile.end);
+    size_t RTLSize = RTLFile.tellg();
+    std::string RTL(RTLSize, '\0');
+    RTLFile.seekg(0);
+    if (!RTLFile.read(&RTL[0], RTLSize)) {
+      DP("I/O Error: Failed to read device RTL.\n");
+      return nullptr;
+    }
+
+    dumpImageToFile(RTL.c_str(), RTLSize, BaseName);
+
+    cl_int RC;
+    cl_program PGM;
+    CALL_CL_RVRC(PGM, clCreateProgramWithIL, RC,
+                 DeviceInfo->getContext(DeviceId), RTL.c_str(), RTLSize);
+    if (RC != CL_SUCCESS) {
+      DP("Error: Failed to create device RTL from IL: %d\n", RC);
+      return nullptr;
+    }
+
+    DeviceInfo->Option.CommonSpecConstants.setProgramConstants(DeviceId, PGM);
+
+    return PGM;
+  }
+
+  DP("Cannot find device RTL: %s\n", RTLPath.c_str());
+  return nullptr;
+}
+
+static inline void *dataAlloc(int32_t DeviceId, int64_t Size, void *HstPtr,
+    void *HstBase, bool ImplicitArg, cl_uint Align = 0) {
+  intptr_t Offset = (intptr_t)HstPtr - (intptr_t)HstBase;
+  // If the offset is negative, then for our practical purposes it can be
+  // considered 0 because the base address of an array will be contained
+  // within or after the allocated memory.
+  intptr_t MeaningfulOffset = Offset >= 0 ? Offset : 0;
+  // If the offset is negative and the size we map is not large enough to reach
+  // the base, then we must allocate extra memory up to the base (+1 to include
+  // at least the first byte the base is pointing to).
+  // Use -Offset rather than abs() so the value is not truncated to int.
+  int64_t MeaningfulSize =
+      (Offset < 0 && -Offset >= Size) ? -Offset + 1 : Size;
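+  // For example (illustrative numbers): with HstBase = 0x1000,
+  // HstPtr = 0x0FF0 and Size = 0x8, Offset is -0x10, so MeaningfulOffset
+  // is 0 and MeaningfulSize is 0x11 -- just enough to cover both the
+  // mapped bytes at HstPtr and the first byte at HstBase.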
+
+  void *Base = nullptr;
+  auto Context = DeviceInfo->getContext(DeviceId);
+  size_t AllocSize = MeaningfulSize + MeaningfulOffset;
+  int32_t AllocKind = TARGET_ALLOC_DEVICE;
+
+  ProfileIntervalTy DataAllocTimer("DataAlloc", DeviceId);
+  DataAllocTimer.start();
+
+  if (DeviceInfo->Option.Flags.UseSVM) {
+    AllocKind = TARGET_ALLOC_SVM;
+    CALL_CL_RV(Base, clSVMAlloc, Context, CL_MEM_READ_WRITE, AllocSize, Align);
+  } else {
+    if (!DeviceInfo->isExtensionFunctionEnabled(DeviceId,
+                                                clDeviceMemAllocINTELId)) {
+      DP("Error: Extension %s is not supported\n",
+         DeviceInfo->getExtensionFunctionName(DeviceId,
+                                              clDeviceMemAllocINTELId));
+      return nullptr;
+    }
+    cl_int RC;
+    auto AllocProp = DeviceInfo->getAllocMemProperties(DeviceId, AllocSize);
+    CALL_CL_EXT_RVRC(DeviceId, Base, clDeviceMemAllocINTEL, RC, Context,
+                     DeviceInfo->Devices[DeviceId], AllocProp->data(),
+                     AllocSize, Align);
+    if (RC != CL_SUCCESS)
+      return nullptr;
+  }
+  if (!Base) {
+    DP("Error: Failed to allocate base buffer\n");
+    return nullptr;
+  }
+  DP("Created base buffer " DPxMOD " during data alloc\n", DPxPTR(Base));
+
+  void *Ret = (void *)((intptr_t)Base + MeaningfulOffset);
+
+  if (ImplicitArg) {
+    DP("Stashing an implicit argument " DPxMOD " for next kernel\n",
+       DPxPTR(Ret));
+  }
+  DeviceInfo->MemAllocInfo[DeviceId]->add(
+      Ret, Base, Size, AllocKind, false, ImplicitArg);
+
+  DataAllocTimer.stop();
+
+  return Ret;
+}
+
+static void *dataAllocExplicit(
+    int32_t DeviceId, int64_t Size, int32_t Kind, cl_uint Align = 0) {
+  auto Device = DeviceInfo->Devices[DeviceId];
+  auto Context = DeviceInfo->getContext(DeviceId);
+  cl_int RC;
+  void *Mem = nullptr;
+  ProfileIntervalTy DataAllocTimer("DataAlloc", DeviceId);
+  DataAllocTimer.start();
+  auto ID = DeviceId;
+  auto AllocProp = DeviceInfo->getAllocMemProperties(DeviceId, Size);
+
+  switch (Kind) {
+  case TARGET_ALLOC_DEVICE:
+    Mem = dataAlloc(DeviceId, Size, nullptr, nullptr, true /* ImplicitArg */,
+                    Align);
+    break;
+  case TARGET_ALLOC_HOST:
+    if (DeviceInfo->Option.Flags.UseSingleContext)
+      ID = DeviceInfo->NumDevices;
+    if (!DeviceInfo->isExtensionFunctionEnabled(DeviceId,
+                                                clHostMemAllocINTELId)) {
+      DP("Host memory allocator is not available\n");
+      return nullptr;
+    }
+    CALL_CL_EXT_RVRC(DeviceId, Mem, clHostMemAllocINTEL, RC, Context,
+                     AllocProp->data(), Size, Align);
+    if (Mem) {
+      DeviceInfo->MemAllocInfo[ID]->add(
+          Mem, Mem, Size, Kind, false /* InPool */, true /* IsImplicitArg */);
+      DP("Allocated a host memory object " DPxMOD "\n", DPxPTR(Mem));
+    }
+    break;
+  case TARGET_ALLOC_SHARED:
+    if (!DeviceInfo->isExtensionFunctionEnabled(
+          DeviceId, clSharedMemAllocINTELId)) {
+      DP("Shared memory allocator is not available\n");
+      return nullptr;
+    }
+    CALL_CL_EXT_RVRC(DeviceId, Mem, clSharedMemAllocINTEL, RC, Context, Device,
+                     AllocProp->data(), Size, Align);
+    if (Mem) {
+      DeviceInfo->MemAllocInfo[ID]->add(
+          Mem, Mem, Size, Kind, false /* InPool */, true /* IsImplicitArg */);
+      DP("Allocated a shared memory object " DPxMOD "\n", DPxPTR(Mem));
+    }
+    break;
+  default:
+    FATAL_ERROR("Invalid target data allocation kind");
+  }
+
+  DataAllocTimer.stop();
+
+  return Mem;
+}
+
+static int32_t submitData(int32_t device_id, void *tgt_ptr, void *hst_ptr,
+                          int64_t size) {
+  if (size == 0)
+    // All other plugins seem to handle zero-size transfers gracefully,
+    // so we do the same.
+    return OFFLOAD_SUCCESS;
+
+  cl_command_queue queue = DeviceInfo->Queues[device_id];
+
+  // Add synthetic delay for experiments
+  addDataTransferLatency();
+
+  const char *ProfileKey = "DataWrite (Host to Device)";
+
+  if (DeviceInfo->Option.Flags.UseBuffer) {
+    std::unique_lock<std::mutex> lock(DeviceInfo->Mutexes[device_id]);
+    if (DeviceInfo->ClMemBuffers[device_id].count(tgt_ptr) > 0) {
+      cl_event event;
+      CALL_CL_RET_FAIL(clEnqueueWriteBuffer, queue, (cl_mem)tgt_ptr, CL_FALSE,
+                       0, size, hst_ptr, 0, nullptr, &event);
+      CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+      if (DeviceInfo->Option.Flags.EnableProfile)
+        DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+
+      return OFFLOAD_SUCCESS;
+    }
+  }
+
+  if (!DeviceInfo->Option.Flags.UseSVM) {
+    if (!DeviceInfo->isExtensionFunctionEnabled(device_id,
+                                                clEnqueueMemcpyINTELId)) {
+      DP("Error: Extension %s is not supported\n",
+         DeviceInfo->getExtensionFunctionName(device_id,
+                                              clEnqueueMemcpyINTELId));
+      return OFFLOAD_FAIL;
+    }
+    cl_event event;
+    CALL_CL_EXT_RET_FAIL(device_id, clEnqueueMemcpyINTEL, queue, CL_FALSE,
+                         tgt_ptr, hst_ptr, size, 0, nullptr, &event);
+    CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  switch (DeviceInfo->Option.DataTransferMethod) {
+  case DATA_TRANSFER_METHOD_SVMMAP: {
+    cl_event event;
+    ProfileIntervalTy SubmitTime(ProfileKey, device_id);
+    SubmitTime.start();
+
+    CALL_CL_RET_FAIL(clEnqueueSVMMap, queue, CL_TRUE, CL_MAP_WRITE, tgt_ptr,
+                     size, 0, nullptr, nullptr);
+    memcpy(tgt_ptr, hst_ptr, size);
+    CALL_CL_RET_FAIL(clEnqueueSVMUnmap, queue, tgt_ptr, 0, nullptr, &event);
+    CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+
+    SubmitTime.stop();
+  } break;
+  case DATA_TRANSFER_METHOD_SVMMEMCPY: {
+    cl_event event;
+    CALL_CL_RET_FAIL(clEnqueueSVMMemcpy, queue, CL_TRUE, tgt_ptr, hst_ptr,
+                     size, 0, nullptr, &event);
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+  } break;
+  case DATA_TRANSFER_METHOD_CLMEM:
+  default: {
+    cl_event event;
+    cl_int rc;
+    cl_mem mem = nullptr;
+    CALL_CL_RVRC(mem, clCreateBuffer, rc, DeviceInfo->getContext(device_id),
+                 CL_MEM_USE_HOST_PTR, size, tgt_ptr);
+    if (rc != CL_SUCCESS) {
+      DP("Error: Failed to create a buffer from a SVM pointer " DPxMOD "\n",
+         DPxPTR(tgt_ptr));
+      return OFFLOAD_FAIL;
+    }
+    CALL_CL_RET_FAIL(clEnqueueWriteBuffer, queue, mem, CL_FALSE, 0, size,
+                     hst_ptr, 0, nullptr, &event);
+    CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+    CALL_CL_RET_FAIL(clReleaseMemObject, mem);
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+  }
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+static int32_t retrieveData(int32_t device_id, void *hst_ptr, void *tgt_ptr,
+                            int64_t size) {
+  if (size == 0)
+    // All other plugins seem to handle zero-size transfers gracefully,
+    // so we do the same.
+    return OFFLOAD_SUCCESS;
+
+  cl_command_queue queue = DeviceInfo->Queues[device_id];
+
+  // Add synthetic delay for experiments
+  addDataTransferLatency();
+
+  const char *ProfileKey = "DataRead (Device to Host)";
+
+  if (DeviceInfo->Option.Flags.UseBuffer) {
+    std::unique_lock<std::mutex> lock(DeviceInfo->Mutexes[device_id]);
+    if (DeviceInfo->ClMemBuffers[device_id].count(tgt_ptr) > 0) {
+      cl_event event;
+      CALL_CL_RET_FAIL(clEnqueueReadBuffer, queue, (cl_mem)tgt_ptr, CL_FALSE,
+                       0, size, hst_ptr, 0, nullptr, &event);
+      CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+      if (DeviceInfo->Option.Flags.EnableProfile)
+        DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+
+      return OFFLOAD_SUCCESS;
+    }
+  }
+
+  if (!DeviceInfo->Option.Flags.UseSVM) {
+    if (!DeviceInfo->isExtensionFunctionEnabled(device_id,
+                                                clEnqueueMemcpyINTELId)) {
+      DP("Error: Extension %s is not supported\n",
+         DeviceInfo->getExtensionFunctionName(device_id,
+                                              clEnqueueMemcpyINTELId));
+      return OFFLOAD_FAIL;
+    }
+    cl_event event;
+    CALL_CL_EXT_RET_FAIL(device_id, clEnqueueMemcpyINTEL, queue, CL_FALSE,
+                         hst_ptr, tgt_ptr, size, 0, nullptr, &event);
+    CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  switch (DeviceInfo->Option.DataTransferMethod) {
+  case DATA_TRANSFER_METHOD_SVMMAP: {
+    cl_event event;
+    ProfileIntervalTy RetrieveTime(ProfileKey, device_id);
+    RetrieveTime.start();
+
+    CALL_CL_RET_FAIL(clEnqueueSVMMap, queue, CL_TRUE, CL_MAP_READ, tgt_ptr,
+                     size, 0, nullptr, nullptr);
+    memcpy(hst_ptr, tgt_ptr, size);
+    CALL_CL_RET_FAIL(clEnqueueSVMUnmap, queue, tgt_ptr, 0, nullptr, &event);
+    CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+
+    RetrieveTime.stop();
+  } break;
+  case DATA_TRANSFER_METHOD_SVMMEMCPY: {
+    cl_event event;
+    CALL_CL_RET_FAIL(clEnqueueSVMMemcpy, queue, CL_TRUE, hst_ptr, tgt_ptr,
+                     size, 0, nullptr, &event);
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+  } break;
+  case DATA_TRANSFER_METHOD_CLMEM:
+  default: {
+    cl_int rc;
+    cl_event event;
+    cl_mem mem = nullptr;
+    CALL_CL_RVRC(mem, clCreateBuffer, rc, DeviceInfo->getContext(device_id),
+                 CL_MEM_USE_HOST_PTR, size, tgt_ptr);
+    if (rc != CL_SUCCESS) {
+      DP("Error: Failed to create a buffer from a SVM pointer " DPxMOD "\n",
+         DPxPTR(tgt_ptr));
+      return OFFLOAD_FAIL;
+    }
+    CALL_CL_RET_FAIL(clEnqueueReadBuffer, queue, mem, CL_FALSE, 0, size,
+                     hst_ptr, 0, nullptr, &event);
+    CALL_CL_RET_FAIL(clWaitForEvents, 1, &event);
+    CALL_CL_RET_FAIL(clReleaseMemObject, mem);
+    if (DeviceInfo->Option.Flags.EnableProfile)
+      DeviceInfo->getProfiles(device_id).update(ProfileKey, event);
+  }
+  }
+  return OFFLOAD_SUCCESS;
+}
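+
+// Transfer-path selection shared by submitData and retrieveData above:
+//   cl_mem buffer target          -> clEnqueueWriteBuffer/clEnqueueReadBuffer
+//   USM pointers (no SVM)         -> clEnqueueMemcpyINTEL
+//   SVM + DATA_TRANSFER_METHOD_SVMMAP    -> clEnqueueSVMMap + memcpy + Unmap
+//   SVM + DATA_TRANSFER_METHOD_SVMMEMCPY -> clEnqueueSVMMemcpy
+//   SVM + DATA_TRANSFER_METHOD_CLMEM (default) -> temporary cl_mem created
+//     with CL_MEM_USE_HOST_PTR over the SVM pointer, then buffer read/write.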
+
+// Return the total number of HW threads required to execute a loop kernel
+// compiled with the given simdWidth, for the given loop trip counts and
+// group sizes.
+// Returns UINT64_MAX if the computation overflows.
+static uint64_t computeThreadsNeeded(
+    const size_t (&tripCounts)[3], const size_t (&groupSizes)[3],
+    uint32_t simdWidth) {
+  uint64_t groupCount[3];
+  for (int i = 0; i < 3; ++i) {
+    if (tripCounts[i] == 0 || groupSizes[i] == 0)
+      return (std::numeric_limits<uint64_t>::max)();
+    groupCount[i] =
+        (uint64_t(tripCounts[i]) + groupSizes[i] - 1) / groupSizes[i];
+    if (groupCount[i] > (std::numeric_limits<uint32_t>::max)())
+      return (std::numeric_limits<uint64_t>::max)();
+  }
+  for (int i = 1; i < 3; ++i) {
+    if ((std::numeric_limits<uint64_t>::max)() / groupCount[0] < groupCount[i])
+      return (std::numeric_limits<uint64_t>::max)();
+    groupCount[0] *= groupCount[i];
+  }
+  // Multiplication of the group sizes must never overflow uint64_t
+  // for any existing device.
+  uint64_t localWorkSize =
+      uint64_t(groupSizes[0]) * groupSizes[1] * groupSizes[2];
+  uint64_t threadsPerWG = ((localWorkSize + simdWidth - 1) / simdWidth);
+
+  // Check that the total number of threads fits uint64_t.
+  if ((std::numeric_limits<uint64_t>::max)() / groupCount[0] <
+      threadsPerWG)
+    return (std::numeric_limits<uint64_t>::max)();
+
+  return groupCount[0] * threadsPerWG;
+}
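+
+// Worked example for computeThreadsNeeded: tripCounts = {4096, 1, 1},
+// groupSizes = {128, 1, 1} and simdWidth = 16 give groupCount = {32, 1, 1},
+// localWorkSize = 128 and threadsPerWG = 128 / 16 = 8, so the kernel
+// needs 32 * 8 = 256 HW threads.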
+
+static void decideLoopKernelGroupArguments(
+    int32_t DeviceId, int32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels,
+    cl_kernel Kernel, size_t *GroupSizes, size_t *GroupCounts) {
+
+  size_t maxGroupSize = DeviceInfo->maxWorkGroupSize[DeviceId];
+  auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel];
+  size_t kernelWidth = KernelProperty.Width;
+  DP("Assumed kernel SIMD width is %zu\n", KernelProperty.SIMDWidth);
+  DP("Preferred group size is multiple of %zu\n", kernelWidth);
+
+  size_t kernelMaxThreadGroupSize = KernelProperty.MaxThreadGroupSize;
+  if (kernelMaxThreadGroupSize < maxGroupSize) {
+    maxGroupSize = kernelMaxThreadGroupSize;
+    DP("Capping maximum thread group size to %zu due to kernel constraints.\n",
+       maxGroupSize);
+  }
+
+  bool maxGroupSizeForced = false;
+
+  if (ThreadLimit > 0) {
+    maxGroupSizeForced = true;
+
+    if ((uint32_t)ThreadLimit <= maxGroupSize) {
+      maxGroupSize = ThreadLimit;
+      DP("Max group size is set to %zu (thread_limit clause)\n", maxGroupSize);
+    } else {
+      DP("thread_limit(%" PRIu32 ") exceeds current maximum %zu\n",
+         ThreadLimit, maxGroupSize);
+    }
+  }
+
+  if (DeviceInfo->Option.ThreadLimit > 0) {
+    maxGroupSizeForced = true;
+
+    if (DeviceInfo->Option.ThreadLimit <= maxGroupSize) {
+      maxGroupSize = DeviceInfo->Option.ThreadLimit;
+      DP("Max group size is set to %zu (OMP_THREAD_LIMIT)\n", maxGroupSize);
+    } else {
+      DP("OMP_THREAD_LIMIT(%" PRIu32 ") exceeds current maximum %zu\n",
+         DeviceInfo->Option.ThreadLimit, maxGroupSize);
+    }
+  }
+
+  if (DeviceInfo->Option.NumTeams > 0)
+    DP("OMP_NUM_TEAMS(%" PRIu32 ") is ignored\n", DeviceInfo->Option.NumTeams);
+
+  GroupCounts[0] = GroupCounts[1] = GroupCounts[2] = 1;
+  size_t groupSizes[3] = {maxGroupSize, 1, 1};
+  TgtLoopDescTy *level = LoopLevels->Levels;
+  int32_t distributeDim = LoopLevels->DistributeDim;
+  assert(distributeDim >= 0 && distributeDim <= 2 &&
+         "Invalid distribute dimension.");
+  int32_t numLoopLevels = LoopLevels->NumLoops;
+  assert((numLoopLevels > 0 && numLoopLevels <= 3) &&
+         "Invalid loop nest description for ND partitioning");
+
+  // Compute global widths for X/Y/Z dimensions.
+  size_t tripCounts[3] = {1, 1, 1};
+
+  for (int32_t i = 0; i < numLoopLevels; i++) {
+    assert(level[i].Stride > 0 && "Invalid loop stride for ND partitioning");
+    DP("Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64
+       ", Stride = %" PRId64 "\n",
+       i, level[i].Lb, level[i].Ub, level[i].Stride);
+    if (level[i].Ub < level[i].Lb)
+      tripCounts[i] = 0;
+    else
+      tripCounts[i] =
+          (level[i].Ub - level[i].Lb + level[i].Stride) / level[i].Stride;
+  }
+
+  // Check if any of the loops has zero iterations.
+  if (tripCounts[0] == 0 || tripCounts[1] == 0 || tripCounts[2] == 0) {
+    std::fill(GroupSizes, GroupSizes + 3, 1);
+    std::fill(GroupCounts, GroupCounts + 3, 1);
+    if (distributeDim > 0 && tripCounts[distributeDim] != 0) {
+      // There is a distribute dimension, and the distribute loop
+      // has non-zero iterations, but some inner parallel loop
+      // has zero iterations. We still want to split the distribute
+      // loop's iterations between many WGs (of size 1), but the inner/lower
+      // dimensions should be 1x1.
+      // Note that this code is currently dead, because we are not
+      // hoisting the inner loops' bounds outside of the target regions.
+      // The code is here just for completeness.
+      size_t distributeTripCount = tripCounts[distributeDim];
+      GroupCounts[distributeDim] = distributeTripCount;
+    }
+    return;
+  }
+
+  if (!maxGroupSizeForced) {
+    // Use clGetKernelSuggestedLocalWorkSizeINTEL to compute group sizes,
+    // or fall back to setting the dimension 0 width to the kernel's
+    // preferred work-group size multiple (kernelWidth).
+    // Note that for a user-specified local work size, groupSizes[0]
+    // is already set to the specified value.
+    size_t globalSizes[3] = { tripCounts[0], tripCounts[1], tripCounts[2] };
+    if (distributeDim > 0) {
+      // There is a distribute dimension.
+      globalSizes[distributeDim - 1] *= globalSizes[distributeDim];
+      globalSizes[distributeDim] = 1;
+    }
+
+    cl_int rc = CL_DEVICE_NOT_FOUND;
+    size_t suggestedGroupSizes[3] = {1, 1, 1};
+    if (DeviceInfo->Option.Flags.UseDriverGroupSizes &&
+        DeviceInfo->isExtensionFunctionEnabled(
+            DeviceId, clGetKernelSuggestedLocalWorkSizeINTELId)) {
+      CALL_CL_EXT(DeviceId, rc, clGetKernelSuggestedLocalWorkSizeINTEL,
+                  DeviceInfo->Queues[DeviceId], Kernel, 3, nullptr, globalSizes,
+                  suggestedGroupSizes);
+    }
+    if (rc == CL_SUCCESS) {
+      groupSizes[0] = suggestedGroupSizes[0];
+      groupSizes[1] = suggestedGroupSizes[1];
+      groupSizes[2] = suggestedGroupSizes[2];
+    } else {
+      if (maxGroupSize > kernelWidth) {
+        groupSizes[0] = kernelWidth;
+      }
+      if (distributeDim == 0 &&
+          // We need to know exact number of HW threads available
+          // on the device, so we need cl_intel_device_attribute_query
+          // extension to be supported.
+          DeviceInfo->Extensions[DeviceId].DeviceAttributeQuery ==
+          ExtensionStatusEnabled) {
+        // If there is a distribute dimension, then we do not use
+        // thin HW threads, since we do not know anything about
+        // the iteration space of the inner parallel loop regions.
+        //
+        // If there is no distribute dimension, then try to use thinner
+        // HW threads to get more independent HW threads executing
+        // the kernel - this may allow more parallelism, since stalls are
+        // distributed across multiple HW threads rather than across
+        // SIMD lanes within one HW thread.
+        assert(groupSizes[1] == 1 && groupSizes[2] == 1 &&
+               "Unexpected group sizes for dimensions 1 and/or 2.");
+        uint32_t simdWidth = KernelProperty.SIMDWidth;
+        auto &deviceProperties = DeviceInfo->DeviceProperties[DeviceId];
+        uint32_t numEUsPerSubslice = deviceProperties.NumEUsPerSubslice;
+        uint32_t numSubslices = deviceProperties.NumSlices *
+            deviceProperties.NumSubslicesPerSlice;
+        uint32_t numThreadsPerEU = deviceProperties.NumThreadsPerEU;
+        uint64_t totalThreads = uint64_t(numThreadsPerEU) * numEUsPerSubslice *
+            numSubslices;
+        totalThreads *= DeviceInfo->Option.ThinThreadsThreshold;
+
+        uint64_t groupSizePrev = groupSizes[0];
+        uint64_t threadsNeeded =
+            computeThreadsNeeded(tripCounts, groupSizes, simdWidth);
+        while (threadsNeeded < totalThreads) {
+          groupSizePrev = groupSizes[0];
+          // Try to halve the local work size (if possible) and see
+          // how many HW threads the kernel will require with this
+          // new local work size.
+          // In most implementations the initial groupSizes[0]
+          // will be a power of two.
+          if (groupSizes[0] <= 1)
+            break;
+          groupSizes[0] >>= 1;
+          threadsNeeded =
+              computeThreadsNeeded(tripCounts, groupSizes, simdWidth);
+        }
+        groupSizes[0] = groupSizePrev;
+      }
+    }
+  }
+
+  for (int32_t i = 0; i < numLoopLevels; i++) {
+    if (i < distributeDim) {
+      GroupCounts[i] = 1;
+      continue;
+    }
+    size_t trip = tripCounts[i];
+    if (groupSizes[i] >= trip)
+      groupSizes[i] = trip;
+    GroupCounts[i] = (trip + groupSizes[i] - 1) / groupSizes[i];
+  }
+  std::copy(groupSizes, groupSizes + 3, GroupSizes);
+}
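+
+// Example of the thin-threads heuristic above (hypothetical numbers):
+// with tripCounts = {8192, 1, 1}, simdWidth = 16, an initial
+// groupSizes[0] = 128 and a scaled thread budget of 1024, the loop halves
+// the group size 128 -> 64 -> 32 -> 16 -> 8; threadsNeeded stays at 512
+// until the group size drops below the SIMD width (1024 at size 8), so the
+// last size that stayed below the budget, 16, is kept.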
+
+static void decideKernelGroupArguments(
+    int32_t DeviceId, int32_t NumTeams, int32_t ThreadLimit,
+    cl_kernel Kernel, size_t *GroupSizes, size_t *GroupCounts) {
+  const KernelInfoTy *KInfo = DeviceInfo->getKernelInfo(DeviceId, Kernel);
+  if (!KInfo) {
+    DP("Warning: Cannot find kernel information for kernel " DPxMOD ".\n",
+       DPxPTR(Kernel));
+  }
+  size_t maxGroupSize = DeviceInfo->maxWorkGroupSize[DeviceId];
+  bool maxGroupSizeForced = false;
+  bool maxGroupCountForced = false;
+
+  auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel];
+  size_t kernelWidth = KernelProperty.Width;
+  DP("Preferred group size is multiple of %zu\n", kernelWidth);
+
+  size_t kernelMaxThreadGroupSize = KernelProperty.MaxThreadGroupSize;
+  if (kernelMaxThreadGroupSize < maxGroupSize) {
+    maxGroupSize = kernelMaxThreadGroupSize;
+    DP("Capping maximum thread group size to %zu due to kernel constraints.\n",
+       maxGroupSize);
+  }
+
+  if (ThreadLimit > 0) {
+    maxGroupSizeForced = true;
+
+    if ((uint32_t)ThreadLimit <= maxGroupSize) {
+      maxGroupSize = ThreadLimit;
+      DP("Max group size is set to %zu (thread_limit clause)\n",
+         maxGroupSize);
+    } else {
+      DP("thread_limit(%" PRIu32 ") exceeds current maximum %zu\n",
+         ThreadLimit, maxGroupSize);
+    }
+  }
+
+  if (DeviceInfo->Option.ThreadLimit > 0) {
+    maxGroupSizeForced = true;
+
+    if (DeviceInfo->Option.ThreadLimit <= maxGroupSize) {
+      maxGroupSize = DeviceInfo->Option.ThreadLimit;
+      DP("Max group size is set to %zu (OMP_THREAD_LIMIT)\n", maxGroupSize);
+    } else {
+      DP("OMP_THREAD_LIMIT(%" PRIu32 ") exceeds current maximum %zu\n",
+         DeviceInfo->Option.ThreadLimit, maxGroupSize);
+    }
+  }
+
+  size_t maxGroupCount = 0;
+
+  if (NumTeams > 0) {
+    maxGroupCount = NumTeams;
+    maxGroupCountForced = true;
+    DP("Max group count is set to %zu "
+       "(num_teams clause or no teams construct)\n", maxGroupCount);
+  } else if (DeviceInfo->Option.NumTeams > 0) {
+    // OMP_NUM_TEAMS only matters if the num_teams() clause is absent.
+    maxGroupCount = DeviceInfo->Option.NumTeams;
+    maxGroupCountForced = true;
+    DP("Max group count is set to %zu (OMP_NUM_TEAMS)\n", maxGroupCount);
+  }
+
+  if (maxGroupCountForced) {
+    // If number of teams is specified by the user, then use kernelWidth
+    // WIs per WG by default, so that it matches
+    // decideLoopKernelGroupArguments() behavior.
+    if (!maxGroupSizeForced) {
+      maxGroupSize = kernelWidth;
+    }
+  } else {
+    maxGroupCount = DeviceInfo->maxExecutionUnits[DeviceId];
+  }
+
+  GroupSizes[0] = maxGroupSize;
+  GroupSizes[1] = GroupSizes[2] = 1;
+
+  if (KInfo && KInfo->getWINum()) {
+    GroupSizes[0] =
+        (std::min)(KInfo->getWINum(), static_cast<uint64_t>(GroupSizes[0]));
+    DP("Capping maximum thread group size to %" PRIu64
+       " due to kernel constraints (reduction).\n", KInfo->getWINum());
+  }
+
+  GroupCounts[0] = maxGroupCount;
+  GroupCounts[1] = GroupCounts[2] = 1;
+  if (!maxGroupCountForced) {
+    if (KInfo && KInfo->getHasTeamsReduction() &&
+        DeviceInfo->Option.ReductionSubscriptionRate) {
+      if (!KInfo->isAtomicFreeReduction() ||
+          !DeviceInfo->Option.ReductionSubscriptionRateIsDefault) {
+        // Use reduction subscription rate 1 for kernels using
+        // atomic-free reductions, unless user forced reduction subscription
+        // rate via environment.
+        GroupCounts[0] /= DeviceInfo->Option.ReductionSubscriptionRate;
+        GroupCounts[0] = (std::max)(GroupCounts[0], size_t(1));
+      }
+    } else {
+      GroupCounts[0] *= DeviceInfo->Option.SubscriptionRate;
+    }
+  }
+
+  if (KInfo && KInfo->getWGNum()) {
+    GroupCounts[0] =
+        (std::min)(KInfo->getWGNum(), static_cast<uint64_t>(GroupCounts[0]));
+    DP("Capping maximum thread groups count to %" PRIu64
+       " due to kernel constraints (reduction).\n", KInfo->getWGNum());
+  }
+}
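+
+// Summary of the precedence implemented in decideKernelGroupArguments:
+//   group size:  min(device max WG size, kernel max WG size), overridden by
+//                the thread_limit clause and OMP_THREAD_LIMIT, then capped
+//                by the kernel's WINum (reduction kernels);
+//   group count: num_teams clause, else OMP_NUM_TEAMS, else the number of
+//                EUs scaled by the subscription rate, then capped by the
+//                kernel's WGNum (reduction kernels).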
+
+static inline int32_t runTargetTeamNDRegion(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
+    ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
+    int32_t ThreadLimit, void *LoopDesc) {
+
+  cl_kernel Kernel = *static_cast<cl_kernel *>(TgtEntryPtr);
+  if (!Kernel) {
+    REPORT("Failed to invoke deleted kernel.\n");
+    return OFFLOAD_FAIL;
+  }
+
+  // Decide group sizes and counts
+  size_t LocalWorkSize[3] = {1, 1, 1};
+  size_t NumWorkGroups[3] = {1, 1, 1};
+  if (LoopDesc) {
+    decideLoopKernelGroupArguments(DeviceId, ThreadLimit,
+                                   (TgtNDRangeDescTy *)LoopDesc, Kernel,
+                                   LocalWorkSize, NumWorkGroups);
+  } else {
+    decideKernelGroupArguments(DeviceId, NumTeams, ThreadLimit, Kernel,
+                               LocalWorkSize, NumWorkGroups);
+  }
+
+  size_t GlobalWorkSize[3];
+  for (int32_t I = 0; I < 3; ++I)
+    GlobalWorkSize[I] = LocalWorkSize[I] * NumWorkGroups[I];
+
+  DP("Team sizes = {%zu, %zu, %zu}\n", LocalWorkSize[0], LocalWorkSize[1],
+     LocalWorkSize[2]);
+  DP("Number of teams = {%zu, %zu, %zu}\n",
+     GlobalWorkSize[0] / LocalWorkSize[0], GlobalWorkSize[1] / LocalWorkSize[1],
+     GlobalWorkSize[2] / LocalWorkSize[2]);
+
+  // Protect thread-unsafe OpenCL API calls
+  DeviceInfo->Mutexes[DeviceId].lock();
+
+  // Set kernel args
+  for (int32_t I = 0; I < NumArgs; ++I) {
+    ptrdiff_t Offset = TgtOffsets[I];
+    const char *ArgType = "Unknown";
+    auto *KernelInfo = DeviceInfo->getKernelInfo(DeviceId, Kernel);
+    if (KernelInfo && KernelInfo->isArgLiteral(I)) {
+      uint32_t Size = KernelInfo->getArgSize(I);
+      CALL_CL_RET_FAIL(clSetKernelArg, Kernel, I, Size, TgtArgs[I]);
+      ArgType = "ByVal";
+    } else if (Offset == (std::numeric_limits<ptrdiff_t>::max)()) {
+      // Offset equal to MAX(ptrdiff_t) means that the argument
+      // must be passed as a literal, and the offset should be ignored.
+      intptr_t Arg = (intptr_t)TgtArgs[I];
+      CALL_CL_RET_FAIL(clSetKernelArg, Kernel, I, sizeof(Arg), &Arg);
+      ArgType = "Scalar";
+    } else {
+      ArgType = "Pointer";
+      void *Ptr = (void *)((intptr_t)TgtArgs[I] + Offset);
+      if (DeviceInfo->Option.Flags.UseBuffer &&
+          DeviceInfo->ClMemBuffers[DeviceId].count(Ptr) > 0) {
+        CALL_CL_RET_FAIL(clSetKernelArg, Kernel, I, sizeof(cl_mem), &Ptr);
+        ArgType = "ClMem";
+      } else if (DeviceInfo->Option.Flags.UseSVM) {
+        CALL_CL_RET_FAIL(clSetKernelArgSVMPointer, Kernel, I, Ptr);
+      } else {
+        if (!DeviceInfo->isExtensionFunctionEnabled(
+                DeviceId, clSetKernelArgMemPointerINTELId)) {
+          DP("Error: Extension %s is not supported\n",
+             DeviceInfo->getExtensionFunctionName(
+                 DeviceId, clSetKernelArgMemPointerINTELId));
+          return OFFLOAD_FAIL;
+        }
+        CALL_CL_EXT_RET_FAIL(
+            DeviceId, clSetKernelArgMemPointerINTEL, Kernel, I, Ptr);
+      }
+    }
+    DP("Kernel %s Arg %d set successfully\n", ArgType, I);
+    (void)ArgType;
+  }
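+  // Argument kinds handled above: ByVal (a literal of known size), Scalar
+  // (offset equal to MAX(ptrdiff_t) is a pass-by-value sentinel), ClMem
+  // (a cl_mem buffer), and plain USM/SVM pointers.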
+
+  auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel];
+  std::vector<void *> ImplicitSVMArgs;
+  std::vector<void *> ImplicitUSMArgs;
+  std::map<int32_t, bool> HasUSMArgs{
+      {TARGET_ALLOC_DEVICE, false},
+      {TARGET_ALLOC_HOST, false},
+      {TARGET_ALLOC_SHARED, false}
+  };
+  auto &AllocInfos = DeviceInfo->MemAllocInfo;
+
+  /// Kernel-dependent implicit arguments
+  for (auto Ptr : KernelProperty.ImplicitArgs) {
+    if (!Ptr)
+      continue;
+    // "Ptr" is not always the allocation information known to libomptarget, so
+    // use "search" instead of "find".
+    auto *Info = AllocInfos[DeviceId]->search(Ptr);
+    if (Info) {
+      if ((int32_t)Info->Kind == TARGET_ALLOC_SVM) {
+        ImplicitSVMArgs.push_back(Ptr);
+      } else {
+        ImplicitUSMArgs.push_back(Ptr);
+        HasUSMArgs[Info->Kind] = true;
+      }
+    }
+    if (DeviceInfo->Option.Flags.UseSingleContext) {
+      Info = AllocInfos[DeviceInfo->NumDevices]->search(Ptr);
+      if (Info) {
+        ImplicitUSMArgs.push_back(Ptr);
+        HasUSMArgs[TARGET_ALLOC_HOST] = true;
+      }
+    }
+  }
+
+  /// Kernel-independent implicit arguments
+  AllocInfos[DeviceId]->getImplicitArgs(ImplicitSVMArgs, ImplicitUSMArgs);
+  for (auto &ArgKind : HasUSMArgs)
+    if (AllocInfos[DeviceId]->hasImplicitUSMArg(ArgKind.first))
+      ArgKind.second = true;
+  if (DeviceInfo->Option.Flags.UseSingleContext) {
+    auto ID = DeviceInfo->NumDevices;
+    AllocInfos[ID]->getImplicitArgs(ImplicitSVMArgs, ImplicitUSMArgs);
+    if (AllocInfos[ID]->hasImplicitUSMArg(TARGET_ALLOC_HOST))
+      HasUSMArgs[TARGET_ALLOC_HOST] = true;
+  }
+
+  if (ImplicitSVMArgs.size() > 0) {
+    DP("Calling clSetKernelExecInfo to pass %zu implicit SVM arguments "
+       "to kernel " DPxMOD "\n", ImplicitSVMArgs.size(), DPxPTR(Kernel));
+    CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS,
+                     sizeof(void *) * ImplicitSVMArgs.size(),
+                     ImplicitSVMArgs.data());
+  }
+
+  if (ImplicitUSMArgs.size() > 0) {
+    // Report non-argument USM pointers to the runtime.
+    DP("Calling clSetKernelExecInfo to pass %zu implicit USM arguments "
+       "to kernel " DPxMOD "\n", ImplicitUSMArgs.size(), DPxPTR(Kernel));
+    CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel,
+                     CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL,
+                     sizeof(void *) * ImplicitUSMArgs.size(),
+                     ImplicitUSMArgs.data());
+    // Mark the kernel as supporting indirect USM accesses, otherwise,
+    // clEnqueueNDRangeKernel call below will fail.
+    cl_bool KernelSupportsUSM = CL_TRUE;
+    if (HasUSMArgs[TARGET_ALLOC_HOST])
+      CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel,
+                       CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL,
+                       sizeof(cl_bool), &KernelSupportsUSM);
+    if (HasUSMArgs[TARGET_ALLOC_DEVICE])
+      CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel,
+                       CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
+                       sizeof(cl_bool), &KernelSupportsUSM);
+    if (HasUSMArgs[TARGET_ALLOC_SHARED])
+      CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel,
+                       CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
+                       sizeof(cl_bool), &KernelSupportsUSM);
+  }
+
+  cl_event Event;
+  CALL_CL_RET_FAIL(clEnqueueNDRangeKernel, DeviceInfo->Queues[DeviceId],
+                   Kernel, 3, nullptr, GlobalWorkSize,
+                   LocalWorkSize, 0, nullptr, &Event);
+
+  DeviceInfo->Mutexes[DeviceId].unlock();
+
+  DP("Started executing kernel.\n");
+
+  CALL_CL_RET_FAIL(clWaitForEvents, 1, &Event);
+  if (DeviceInfo->Option.Flags.EnableProfile) {
+    std::vector<char> Buf;
+    size_t BufSize;
+    CALL_CL_RET_FAIL(clGetKernelInfo, Kernel, CL_KERNEL_FUNCTION_NAME, 0,
+                     nullptr, &BufSize);
+    std::string KernelName("Kernel ");
+    if (BufSize > 0) {
+      Buf.resize(BufSize);
+      CALL_CL_RET_FAIL(clGetKernelInfo, Kernel, CL_KERNEL_FUNCTION_NAME,
+                       Buf.size(), Buf.data(), nullptr);
+      KernelName += Buf.data();
+    }
+    DeviceInfo->getProfiles(DeviceId).update(KernelName.c_str(), Event);
+  }
+  DP("Successfully finished kernel execution.\n");
+
+  return OFFLOAD_SUCCESS;
+}
+
+void *RTLDeviceInfoTy::allocDataClMem(int32_t DeviceId, size_t Size) {
+  cl_mem ret = nullptr;
+  cl_int rc;
+
+  CALL_CL_RVRC(ret, clCreateBuffer, rc, getContext(DeviceId),
+               CL_MEM_READ_WRITE, Size, nullptr);
+  if (rc != CL_SUCCESS)
+    return nullptr;
+
+  std::unique_lock<std::mutex> lock(Mutexes[DeviceId]);
+  ClMemBuffers[DeviceId].insert((void *)ret);
+
+  DP("Allocated cl_mem data " DPxMOD "\n", DPxPTR(ret));
+  return (void *)ret;
+}
+
+uint32_t RTLDeviceInfoTy::getPCIDeviceId(int32_t DeviceId) {
+  if (Extensions[DeviceId].DeviceAttributeQuery == ExtensionStatusEnabled)
+    return DeviceProperties[DeviceId].DeviceId;
+
+  uint32_t Id = 0;
+#ifndef _WIN32
+  // Linux: Device name contains "[0xABCD]" device identifier.
+  if (Option.DeviceType == CL_DEVICE_TYPE_GPU) {
+    std::string DeviceName(Names[DeviceId].data());
+    auto P = DeviceName.rfind("[");
+    if (P != std::string::npos && DeviceName.size() - P >= 8)
+      Id = std::strtol(DeviceName.substr(P + 1, 6).c_str(), nullptr, 16);
+  }
+#endif
+  return Id;
+}
+
+uint64_t RTLDeviceInfoTy::getDeviceArch(int32_t DeviceId) {
+  if (Option.DeviceType == CL_DEVICE_TYPE_CPU)
+    return DeviceArch_x86_64;
+
+  uint32_t PCIDeviceId = getPCIDeviceId(DeviceId);
+  if (PCIDeviceId != 0) {
+    for (auto &Arch : DeviceArchMap)
+      for (auto Id : Arch.second)
+        if (PCIDeviceId == Id || (PCIDeviceId & 0xFF00) == Id)
+          return Arch.first;  // Exact match or prefix match
+  }
+
+  std::string DeviceName(Names[DeviceId].data());
+#ifdef _WIN32
+  // Windows: Device name contains published product name.
+  for (auto &Arch : DeviceArchStrMap)
+    for (auto Str : Arch.second)
+      if (DeviceName.find(Str) != std::string::npos)
+        return Arch.first;
+#endif
+
+  DP("Warning: Cannot decide device arch for %s.\n", DeviceName.c_str());
+  return DeviceArch_None;
+}
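+
+// Arch detection order used above: the PCI device id reported via
+// cl_intel_device_attribute_query when available, then the "[0xABCD]"
+// suffix parsed from the device name (Linux), then published product-name
+// substrings (Windows); DeviceArch_None is returned when nothing matches.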
+
+cl_unified_shared_memory_type_intel RTLDeviceInfoTy::getMemAllocType(
+    int32_t DeviceId, const void *Ptr) {
+  cl_unified_shared_memory_type_intel MemType = CL_MEM_TYPE_UNKNOWN_INTEL;
+  CALL_CL_EXT_RET(DeviceId, MemType, clGetMemAllocInfoINTEL,
+                  getContext(DeviceId), Ptr, CL_MEM_ALLOC_TYPE_INTEL,
+                  sizeof(MemType), &MemType, nullptr);
+  return MemType;
+}
+
+const KernelInfoTy *RTLDeviceInfoTy::getKernelInfo(
+    int32_t DeviceId, const cl_kernel &Kernel) const {
+  for (auto &Program : Programs[DeviceId]) {
+    auto *KernelInfo = Program.getKernelInfo(Kernel);
+    if (KernelInfo)
+      return KernelInfo;
+  }
+
+  return nullptr;
+}
+
+// Get memory attributes for the given allocation size.
+std::unique_ptr<std::vector<cl_mem_properties_intel>>
+RTLDeviceInfoTy::getAllocMemProperties(int32_t DeviceId, size_t Size) {
+  std::vector<cl_mem_properties_intel> Properties;
+  Properties.push_back(0);
+
+  return std::make_unique<std::vector<cl_mem_properties_intel>>(
+      std::move(Properties));
+}
+
+void SpecConstantsTy::setProgramConstants(
+    int32_t DeviceId, cl_program Program) const {
+  cl_int Rc;
+
+  if (!DeviceInfo->isExtensionFunctionEnabled(
+          DeviceId, clSetProgramSpecializationConstantId)) {
+    DP("Error: Extension %s is not supported.\n",
+       DeviceInfo->getExtensionFunctionName(
+           DeviceId, clSetProgramSpecializationConstantId));
+    return;
+  }
+
+  for (int I = ConstantValues.size(); I > 0; --I) {
+    cl_uint Id = static_cast<cl_uint>(ConstantIds[I - 1]);
+    size_t Size = ConstantValueSizes[I - 1];
+    const void *Val = ConstantValues[I - 1];
+    CALL_CL_EXT_SILENT(DeviceId, Rc, clSetProgramSpecializationConstant,
+                       Program, Id, Size, Val);
+    if (Rc == CL_SUCCESS)
+      DP("Set specialization constant '0x%X'\n", static_cast<int32_t>(Id));
+  }
+}
+
+int32_t ExtensionsTy::getExtensionsInfoForDevice(int32_t DeviceNum) {
+  // Identify the size of OpenCL extensions string.
+  size_t RetSize = 0;
+  // If the call below fails, some extensions' status may be
+  // left ExtensionStatusUnknown, so only ExtensionStatusEnabled
+  // actually means that the extension is enabled.
+  DP("Getting extensions for device %d\n", DeviceNum);
+
+  cl_device_id DeviceId = DeviceInfo->Devices[DeviceNum];
+  CALL_CL_RET_FAIL(clGetDeviceInfo, DeviceId, CL_DEVICE_EXTENSIONS, 0, nullptr,
+                   &RetSize);
+
+  std::unique_ptr<char[]> Data(new char[RetSize]);
+  CALL_CL_RET_FAIL(clGetDeviceInfo, DeviceId, CL_DEVICE_EXTENSIONS, RetSize,
+                   Data.get(), &RetSize);
+
+  std::string Extensions(Data.get());
+  DP("Device extensions: %s\n", Extensions.c_str());
+
+  if (UnifiedSharedMemory == ExtensionStatusUnknown &&
+      Extensions.find("cl_intel_unified_shared_memory") != std::string::npos) {
+    UnifiedSharedMemory = ExtensionStatusEnabled;
+    DP("Extension UnifiedSharedMemory enabled.\n");
+  }
+
+  if (DeviceAttributeQuery == ExtensionStatusUnknown &&
+      Extensions.find("cl_intel_device_attribute_query") != std::string::npos) {
+    DeviceAttributeQuery = ExtensionStatusEnabled;
+    DP("Extension DeviceAttributeQuery enabled.\n");
+  }
+
+  // Check if the extension was not explicitly disabled, i.e.
+  // that its current status is unknown.
+  if (GetDeviceGlobalVariablePointer == ExtensionStatusUnknown)
+    // FIXME: use the right extension name.
+    if (Extensions.find("") != std::string::npos) {
+      GetDeviceGlobalVariablePointer = ExtensionStatusEnabled;
+      DP("Extension clGetDeviceGlobalVariablePointerINTEL enabled.\n");
+    }
+
+  if (SuggestedGroupSize == ExtensionStatusUnknown)
+    // FIXME: use the right extension name.
+    if (Extensions.find("") != std::string::npos) {
+      SuggestedGroupSize = ExtensionStatusEnabled;
+      DP("Extension clGetKernelSuggestedLocalWorkSizeINTEL enabled.\n");
+    }
+
+  std::for_each(LibdeviceExtensions.begin(), LibdeviceExtensions.end(),
+                [&Extensions](LibdeviceExtDescTy &Desc) {
+                  if (Desc.Status == ExtensionStatusUnknown)
+                    if (Extensions.find(Desc.Name) != std::string::npos) {
+                      Desc.Status = ExtensionStatusEnabled;
+                      DP("Extension %s enabled.\n", Desc.Name);
+                    }
+                });
+
+  return CL_SUCCESS;
+}
+
+int32_t DevicePropertiesTy::getDeviceProperties(cl_device_id ID) {
+  CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_ID_INTEL, sizeof(cl_uint),
+                   &DeviceId, nullptr);
+  CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_NUM_SLICES_INTEL,
+                   sizeof(cl_uint), &NumSlices, nullptr);
+  CALL_CL_RET_FAIL(clGetDeviceInfo, ID,
+                   CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL, sizeof(cl_uint),
+                   &NumSubslicesPerSlice, nullptr);
+  CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL,
+                   sizeof(cl_uint), &NumEUsPerSubslice, nullptr);
+  CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_NUM_THREADS_PER_EU_INTEL,
+                   sizeof(cl_uint), &NumThreadsPerEU, nullptr);
+
+  NumHWThreads =
+      NumSlices * NumSubslicesPerSlice * NumEUsPerSubslice * NumThreadsPerEU;
+
+  return OFFLOAD_SUCCESS;
+}
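+
+// Example with hypothetical numbers: a GPU reporting 2 slices, 4 subslices
+// per slice, 8 EUs per subslice and 7 threads per EU yields
+// NumHWThreads = 2 * 4 * 8 * 7 = 448.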
+
+OpenCLProgramTy::~OpenCLProgramTy() {
+  for (auto Kernel : Kernels) {
+    if (Kernel)
+      CALL_CL_RET_VOID(clReleaseKernel, Kernel);
+  }
+  for (auto PGM : Programs) {
+    CALL_CL_RET_VOID(clReleaseProgram, PGM);
+  }
+  if (RequiresProgramLink) {
+    CALL_CL_RET_VOID(clReleaseProgram, FinalProgram);
+  }
+  // Deallocate the offload entry names.
+  for (auto &Entry : OffloadEntries)
+    delete[] Entry.Base.name;
+}
+
+/// Add program read from a single section
+int32_t OpenCLProgramTy::addProgramIL(const size_t Size,
+                                      const unsigned char *Image) {
+  cl_program PGM;
+  cl_int RC;
+  CALL_CL_RVRC(PGM, clCreateProgramWithIL, RC, Context, Image, Size);
+
+  auto Flags = DeviceInfo->Option.Flags;
+
+  if (RC != CL_SUCCESS || Flags.ShowBuildLog)
+    debugPrintBuildLog(PGM, Device);
+
+  if (RC != CL_SUCCESS) {
+    DP("Error: Failed to create program from SPIR-V: %d\n", RC);
+    return OFFLOAD_FAIL;
+  }
+
+  DeviceInfo->Option.CommonSpecConstants.setProgramConstants(DeviceId, PGM);
+  Programs.push_back(PGM);
+  IsBinary = false;
+
+  // The first SPIR-V image is expected to be the only image, or the first
+  // image that contains global information. We also add fallback libdevice
+  // images here if required.
+  if (Programs.size() == 1 && Flags.LinkLibDevice) {
+    auto &Extensions = DeviceInfo->Extensions[DeviceId].LibdeviceExtensions;
+    for (auto &ExtDesc : Extensions) {
+      if (ExtDesc.Status == ExtensionStatusEnabled) {
+        DP("Fallback libdevice RTL %s is not required.\n",
+           ExtDesc.FallbackLibName);
+        continue;
+      }
+      // Device runtime does not support this libdevice extension,
+      // so we have to link in the fallback implementation.
+      //
+      // TODO: the device image must specify which libdevice extensions
+      //       are actually required. We should link only the required
+      //       fallback implementations.
+      PGM = createProgramFromFile(ExtDesc.FallbackLibName, DeviceId);
+      if (PGM) {
+        DP("Added fallback libdevice RTL %s.\n", ExtDesc.FallbackLibName);
+        Programs.push_back(PGM);
+      } else {
+        DP("Cannot add fallback libdeice RTL %s.\n", ExtDesc.FallbackLibName);
+      }
+    }
+  }
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t OpenCLProgramTy::addProgramBIN(const size_t Size,
+                                       const unsigned char *Image) {
+  cl_program PGM;
+  cl_int RC;
+  CALL_CL_RVRC(PGM, clCreateProgramWithBinary, RC, Context, 1, &Device,
+               &Size, &Image, nullptr);
+
+  if (RC != CL_SUCCESS || DeviceInfo->Option.Flags.ShowBuildLog)
+    debugPrintBuildLog(PGM, Device);
+
+  if (RC != CL_SUCCESS) {
+    DP("Error: Failed to create program from binary: %d\n", RC);
+    return OFFLOAD_FAIL;
+  }
+
+  DeviceInfo->Option.CommonSpecConstants.setProgramConstants(DeviceId, PGM);
+  Programs.push_back(PGM);
+  IsBinary = true;
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t OpenCLProgramTy::buildPrograms(std::string &CompilationOptions,
+                                       std::string &LinkingOptions) {
+  int32_t RC;
+
+  uint64_t MajorVer, MinorVer;
+  if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+    // Handle legacy plain SPIR-V image.
+    char *ImgBegin = reinterpret_cast<char *>(Image->ImageStart);
+    char *ImgEnd = reinterpret_cast<char *>(Image->ImageEnd);
+    size_t ImgSize = ImgEnd - ImgBegin;
+    dumpImageToFile(ImgBegin, ImgSize, "OpenMP");
+    if (addProgramIL(ImgSize, (unsigned char *)ImgBegin) != OFFLOAD_SUCCESS)
+      return OFFLOAD_FAIL;
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  // Iterate over the images and pick the first one that fits.
+  char *ImgBegin = reinterpret_cast<char *>(Image->ImageStart);
+  char *ImgEnd = reinterpret_cast<char *>(Image->ImageEnd);
+  size_t ImgSize = ImgEnd - ImgBegin;
+  ElfL E(ImgBegin, ImgSize);
+  assert(E.isValidElf() &&
+         "isValidOneOmpImage() returns true for invalid ELF image.");
+  assert(MajorVer == 1 && MinorVer == 0 &&
+         "FIXME: update image processing for new oneAPI OpenMP version.");
+  // Collect auxiliary information.
+  uint64_t ImageCount = 0;
+  uint64_t MaxImageIdx = 0;
+  struct V1ImageInfo {
+    // 0 - native, 1 - SPIR-V
+    uint64_t Format = (std::numeric_limits<uint64_t>::max)();
+    std::string CompileOpts;
+    std::string LinkOpts;
+    const uint8_t *Begin;
+    uint64_t Size;
+
+    V1ImageInfo(uint64_t Format, std::string CompileOpts,
+                std::string LinkOpts, const uint8_t *Begin, uint64_t Size)
+      : Format(Format), CompileOpts(CompileOpts),
+        LinkOpts(LinkOpts), Begin(Begin), Size(Size) {}
+  };
+
+  std::unordered_map<uint64_t, V1ImageInfo> AuxInfo;
+
+  for (auto I = E.section_notes_begin(), IE = E.section_notes_end(); I != IE;
+       ++I) {
+    ElfLNote Note = *I;
+    if (Note.getNameSize() == 0)
+      continue;
+    std::string NameStr(Note.getName(), Note.getNameSize());
+    if (NameStr != "INTELONEOMPOFFLOAD")
+      continue;
+    uint64_t Type = Note.getType();
+    std::string DescStr(reinterpret_cast<const char *>(Note.getDesc()),
+                        Note.getDescSize());
+    switch (Type) {
+    default:
+      DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n");
+      break;
+    case NT_INTEL_ONEOMP_OFFLOAD_VERSION:
+      break;
+    case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT:
+      ImageCount = std::stoull(DescStr);
+      break;
+    case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: {
+      std::vector<std::string> Parts;
+      do {
+        auto DelimPos = DescStr.find('\0');
+        if (DelimPos == std::string::npos) {
+          Parts.push_back(DescStr);
+          break;
+        }
+        Parts.push_back(DescStr.substr(0, DelimPos));
+        DescStr.erase(0, DelimPos + 1);
+      } while (Parts.size() < 4);
+
+      // Ignore records with less than 4 strings.
+      if (Parts.size() != 4) {
+        DP("Warning: short NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX "
+           "record is ignored.\n");
+        continue;
+      }
+
+      uint64_t Idx = std::stoull(Parts[0]);
+      MaxImageIdx = (std::max)(MaxImageIdx, Idx);
+      if (AuxInfo.find(Idx) != AuxInfo.end()) {
+        DP("Warning: duplicate auxiliary information for image %" PRIu64
+           " is ignored.\n", Idx);
+        continue;
+      }
+      AuxInfo.emplace(std::piecewise_construct,
+                      std::forward_as_tuple(Idx),
+                      std::forward_as_tuple(std::stoull(Parts[1]),
+                                            Parts[2], Parts[3],
+                                            // Image pointer and size
+                                            // will be initialized later.
+                                            nullptr, 0));
+    }
+    }
+  }
+
+  if (MaxImageIdx >= ImageCount)
+    DP("Warning: invalid image index found in auxiliary information.\n");
+
+  for (auto I = E.sections_begin(), IE = E.sections_end(); I != IE; ++I) {
+    const char *Prefix = "__openmp_offload_spirv_";
+    std::string SectionName((*I).getName() ? (*I).getName() : "");
+    if (SectionName.find(Prefix) != 0)
+      continue;
+    SectionName.erase(0, std::strlen(Prefix));
+    uint64_t Idx = std::stoull(SectionName);
+    if (Idx >= ImageCount) {
+      DP("Warning: ignoring image section (index %" PRIu64
+         " is out of range).\n", Idx);
+      continue;
+    }
+
+    auto AuxInfoIt = AuxInfo.find(Idx);
+    if (AuxInfoIt == AuxInfo.end()) {
+      DP("Warning: ignoring image section (no aux info).\n");
+      continue;
+    }
+
+    AuxInfoIt->second.Begin = (*I).getContents();
+    AuxInfoIt->second.Size = (*I).getSize();
+  }
+
+  for (uint64_t Idx = 0; Idx < ImageCount; ++Idx) {
+    auto It = AuxInfo.find(Idx);
+    if (It == AuxInfo.end()) {
+      DP("Warning: image %" PRIu64
+         " without auxiliary information is ingored.\n", Idx);
+      continue;
+    }
+
+    const unsigned char *ImgBegin =
+        reinterpret_cast<const unsigned char *>(It->second.Begin);
+    size_t ImgSize = It->second.Size;
+    dumpImageToFile(ImgBegin, ImgSize, "OpenMP");
+
+    if (It->second.Format == 0) {
+      // Native format.
+      RC = addProgramBIN(ImgSize, ImgBegin);
+    } else if (It->second.Format == 1) {
+      // SPIR-V format.
+      RC = addProgramIL(ImgSize, ImgBegin);
+    } else {
+      DP("Warning: image %" PRIu64 "is ignored due to unknown format.\n", Idx);
+      continue;
+    }
+
+    if (RC != OFFLOAD_SUCCESS)
+      continue;
+
+    DP("Created offload program from image #%" PRIu64 ".\n", Idx);
+    if (DeviceInfo->Option.Flags.UseImageOptions) {
+      CompilationOptions += " " + It->second.CompileOpts;
+      LinkingOptions += " " + It->second.LinkOpts;
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  return OFFLOAD_FAIL;
+}
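+
+// INTELONEOMPOFFLOAD note records consumed by buildPrograms (v1.0):
+//   VERSION, IMAGE_COUNT, and one IMAGE_AUX record per image holding four
+//   '\0'-separated fields: image index, format (0 = native binary,
+//   1 = SPIR-V), compilation options and linking options. The image bytes
+//   themselves live in "__openmp_offload_spirv_<index>" sections.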
+
+int32_t OpenCLProgramTy::compilePrograms(std::string &CompOptions,
+                                         std::string &LinkOptions) {
+  if (IsBinary && Programs.size() > 1) {
+    // Nothing to be done for split-kernel binaries, as the later linking
+    // step falls back to SPIR-V recompilation.
+    DP("Skipping compilation for multiple binary images.\n");
+    RequiresProgramLink = true;
+    return OFFLOAD_SUCCESS;
+  }
+
+  cl_int RC;
+  auto &Flags = DeviceInfo->Option.Flags;
+
+  if (Programs.size() == 1 &&
+      (IsBinary || Flags.EnableSimd ||
+      // Work around GPU API issue: clCompileProgram/clLinkProgram
+      // do not work with -vc-codegen, so we have to use clBuildProgram.
+      CompOptions.find(" -vc-codegen ") != std::string::npos)) {
+    auto BuildOptions = CompOptions + " " + LinkOptions;
+    CALL_CL(RC, clBuildProgram, Programs[0], 0, nullptr, BuildOptions.c_str(),
+            nullptr, nullptr);
+    if (RC != CL_SUCCESS || Flags.ShowBuildLog) {
+      debugPrintBuildLog(Programs[0], Device);
+      if (RC != CL_SUCCESS) {
+        DP("Error: Failed to build program: %d\n", RC);
+        return OFFLOAD_FAIL;
+      }
+    }
+    RequiresProgramLink = false;
+    return OFFLOAD_SUCCESS;
+  }
+
+  // Compile one or more SPIR-V programs.
+  for (auto &PGM : Programs) {
+    CALL_CL(RC, clCompileProgram, PGM, 0, nullptr, CompOptions.c_str(), 0,
+            nullptr, nullptr, nullptr, nullptr);
+    if (RC != CL_SUCCESS || Flags.ShowBuildLog) {
+      debugPrintBuildLog(PGM, Device);
+      if (RC != CL_SUCCESS) {
+        DP("Error: Failed to compile program: %d\n", RC);
+        return OFFLOAD_FAIL;
+      }
+    }
+  }
+
+  RequiresProgramLink = true;
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t OpenCLProgramTy::linkPrograms(std::string &LinkOptions) {
+  if (!RequiresProgramLink) {
+    FinalProgram = Programs[0];
+    DP("Program linking is not required.\n");
+    return OFFLOAD_SUCCESS;
+  }
+
+  cl_int RC;
+  CALL_CL_RVRC(FinalProgram, clLinkProgram, RC, Context, 1, &Device,
+               LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr,
+               nullptr);
+  if (RC != CL_SUCCESS || DeviceInfo->Option.Flags.ShowBuildLog) {
+    debugPrintBuildLog(FinalProgram, Device);
+    if (RC != CL_SUCCESS) {
+      DP("Error: Failed to link program: %d\n", RC);
+      return OFFLOAD_FAIL;
+    }
+  }
+
+  DP("Successfully linked %zu programs.\n", Programs.size());
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t OpenCLProgramTy::buildKernels() {
+  size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
+
+  Entries.resize(NumEntries);
+  Kernels.resize(NumEntries);
+
+  ProfileIntervalTy EntriesTimer("OffloadEntriesInit", DeviceId);
+  EntriesTimer.start();
+  if (!loadOffloadTable(NumEntries))
+    DP("Warning: could not load offload table.\n");
+  EntriesTimer.stop();
+
+  // We are supposed to have a single final program at this point
+  for (size_t I = 0; I < NumEntries; I++) {
+    // A size of 0 means that the entry is a kernel function.
+    auto Size = Image->EntriesBegin[I].size;
+    char *Name = Image->EntriesBegin[I].name;
+
+    if (Size != 0) {
+      EntriesTimer.start();
+      void *HostAddr = Image->EntriesBegin[I].addr;
+      void *TgtAddr = getOffloadVarDeviceAddr(Name, Size);
+
+      if (!TgtAddr) {
+        TgtAddr = __tgt_rtl_data_alloc(DeviceId, Size, HostAddr,
+                                       TARGET_ALLOC_DEFAULT);
+        __tgt_rtl_data_submit(DeviceId, TgtAddr, HostAddr, Size);
+        DP("Warning: global variable '%s' allocated. "
+           "Direct references will not work properly.\n", Name);
+      }
+
+      DP("Global variable mapped: Name = %s, Size = %zu, "
+         "HostPtr = " DPxMOD ", TgtPtr = " DPxMOD "\n",
+         Name, Size, DPxPTR(HostAddr), DPxPTR(TgtAddr));
+      Entries[I].addr = TgtAddr;
+      Entries[I].name = Name;
+      Entries[I].size = Size;
+      Kernels[I] = nullptr;
+      EntriesTimer.stop();
+      continue;
+    }
+
+#ifdef _WIN32
+    // FIXME: temporarily allow zero padding bytes in the entries table
+    //        added by the MSVC linker (e.g. for incremental linking).
+    if (!Name) {
+      // Initialize the members to be on the safe side.
+      DP("Warning: Entry with a null name.\n");
+      Entries[I].addr = nullptr;
+      Entries[I].name = nullptr;
+      continue;
+    }
+#endif  // _WIN32
+    cl_int RC;
+    CALL_CL_RVRC(Kernels[I], clCreateKernel, RC, FinalProgram, Name);
+    if (RC != CL_SUCCESS) {
+      // If a kernel was deleted by optimizations (e.g. DCE), then
+      // clCreateKernel will fail. We expect that such a kernel
+      // will never be actually invoked.
+      DP("Warning: Failed to create kernel %s, %d\n", Name, RC);
+      Kernels[I] = nullptr;
+    }
+    Entries[I].addr = &Kernels[I];
+    Entries[I].name = Name;
+
+    // Do not try to query information for deleted kernels.
+    if (!Kernels[I])
+      continue;
+
+    if (!readKernelInfo(Entries[I])) {
+      DP("Error: failed to read kernel info for kernel %s\n", Name);
+      return OFFLOAD_FAIL;
+    }
+
+    // Retrieve kernel group size info.
+    auto Kernel = Kernels[I];
+    auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel];
+    CALL_CL_RET_FAIL(clGetKernelWorkGroupInfo, Kernel, Device,
+                     CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                     sizeof(size_t), &KernelProperty.Width, nullptr);
+    CALL_CL_RET_FAIL(clGetKernelSubGroupInfo, Kernel, Device,
+                     CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, sizeof(size_t),
+                     &KernelProperty.SIMDWidth, sizeof(size_t),
+                     &KernelProperty.SIMDWidth, nullptr);
+    if (KernelProperty.SIMDWidth == 0) {
+      // clGetKernelSubGroupInfo is not supported on Windows with a CPU
+      // device, so assign a default value to avoid any issues when using
+      // this variable.
+      KernelProperty.SIMDWidth = KernelProperty.Width / 2;
+    }
+    assert(KernelProperty.SIMDWidth <= KernelProperty.Width &&
+           "Invalid preferred group size multiple.");
+    CALL_CL_RET_FAIL(clGetKernelWorkGroupInfo, Kernel, Device,
+                     CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t),
+                     &KernelProperty.MaxThreadGroupSize, nullptr);
+
+    if (DebugLevel > 0) {
+      // Show kernel information
+      std::vector<char> Buf;
+      size_t BufSize;
+      cl_uint NumArgs = 0;
+      CALL_CL_RET_FAIL(clGetKernelInfo, Kernel, CL_KERNEL_NUM_ARGS,
+                       sizeof(cl_uint), &NumArgs, nullptr);
+      DP("Kernel %zu: Name = %s, NumArgs = %" PRIu32 "\n", I, Name, NumArgs);
+      for (cl_uint J = 0; J < NumArgs; J++) {
+        // According to the specification, clGetKernelArgInfo only works
+        // for programs created with clCreateProgramWithSource. We still
+        // allow this if the backend RT is capable of returning the
+        // argument information without requiring program source.
+        CALL_CL_SILENT(RC, clGetKernelArgInfo, Kernel, J,
+                       CL_KERNEL_ARG_TYPE_NAME, 0, nullptr, &BufSize);
+        if (RC != CL_SUCCESS)
+          break; // Kernel argument info won't be available
+        Buf.resize(BufSize);
+        CALL_CL_RET_FAIL(clGetKernelArgInfo, Kernel, J,
+                         CL_KERNEL_ARG_TYPE_NAME, BufSize, Buf.data(), nullptr);
+        std::string TypeName = Buf.data();
+        CALL_CL_RET_FAIL(clGetKernelArgInfo, Kernel, J, CL_KERNEL_ARG_NAME, 0,
+                         nullptr, &BufSize);
+        Buf.resize(BufSize);
+        CALL_CL_RET_FAIL(clGetKernelArgInfo, Kernel, J, CL_KERNEL_ARG_NAME,
+                         BufSize, Buf.data(), nullptr);
+        DP("  Arg %2" PRIu32 ": %s %s\n", J, TypeName.c_str(),
+           Buf.data() ? Buf.data() : "undefined");
+      }
+    }
+  }
+
+  Table.EntriesBegin = Entries.data();
+  Table.EntriesEnd = Entries.data() + Entries.size();
+
+  return OFFLOAD_SUCCESS;
+}
+
+bool OpenCLProgramTy::readKernelInfo(const __tgt_offload_entry &KernelEntry) {
+  const cl_kernel *KernelPtr =
+      reinterpret_cast<const cl_kernel *>(KernelEntry.addr);
+  const char *Name = KernelEntry.name;
+  std::string InfoVarName(Name);
+  InfoVarName += "_kernel_info";
+  size_t InfoVarSize = 0;
+  void *InfoVarAddr = getVarDeviceAddr(InfoVarName.c_str(), &InfoVarSize);
+  // If there is no kernel info variable, then the kernel might have been
+  // produced by older toolchain - this is acceptable, so return success.
+  if (!InfoVarAddr)
+    return true;
+  if (InfoVarSize == 0) {
+    DP("Error: kernel info variable cannot have 0 size.\n");
+    return false;
+  }
+  std::vector<char> InfoBuffer;
+  InfoBuffer.resize(InfoVarSize);
+  CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL,
+                  DeviceInfo->Queues[DeviceId],
+                  /*blocking=*/CL_TRUE, InfoBuffer.data(),
+                  InfoVarAddr, InfoVarSize,
+                  /*num_events_in_wait_list=*/0,
+                  /*event_wait_list=*/nullptr,
+                  /*event=*/nullptr);
+  // TODO: add support for big-endian devices, if needed.
+  //       Currently supported devices are little-endian.
+  char *ReadPtr = InfoBuffer.data();
+  uint32_t Version = llvm::support::endian::read32le(ReadPtr);
+  if (Version == 0) {
+    DP("Error: version 0 of kernel info structure is illegal.\n");
+    return false;
+  }
+  if (Version > 4) {
+    DP("Error: unsupported version (%" PRIu32 ") of kernel info structure.\n",
+       Version);
+    DP("Error: please use newer OpenMP offload runtime.\n");
+    return false;
+  }
+  ReadPtr += 4;
+  uint32_t KernelArgsNum = llvm::support::endian::read32le(ReadPtr);
+  size_t ExpectedInfoVarSize = static_cast<size_t>(KernelArgsNum) * 8 + 8;
+  // Support Attributes1 since version 2.
+  if (Version > 1)
+    ExpectedInfoVarSize += 8;
+  // Support WGNum since version 3.
+  if (Version > 2)
+    ExpectedInfoVarSize += 8;
+  // Support WINum since version 4.
+  if (Version > 3)
+    ExpectedInfoVarSize += 8;
+  if (InfoVarSize != ExpectedInfoVarSize) {
+    DP("Error: expected kernel info variable size %zu - got %zu\n",
+       ExpectedInfoVarSize, InfoVarSize);
+    return false;
+  }
+  KernelInfoTy Info(Version);
+  ReadPtr += 4;
+  for (uint64_t I = 0; I < KernelArgsNum; ++I) {
+    bool ArgIsLiteral = (llvm::support::endian::read32le(ReadPtr) != 0);
+    ReadPtr += 4;
+    uint32_t ArgSize = llvm::support::endian::read32le(ReadPtr);
+    ReadPtr += 4;
+    Info.addArgInfo(ArgIsLiteral, ArgSize);
+  }
+
+  if (Version > 1) {
+    // Read 8-byte Attributes1 since version 2.
+    uint64_t Attributes1 = llvm::support::endian::read64le(ReadPtr);
+    Info.setAttributes1(Attributes1);
+    ReadPtr += 8;
+  }
+
+  if (Version > 2) {
+    // Read 8-byte WGNum since version 3.
+    uint64_t WGNum = llvm::support::endian::read64le(ReadPtr);
+    Info.setWGNum(WGNum);
+    ReadPtr += 8;
+  }
+
+  if (Version > 3) {
+    // Read 8-byte WINum since version 4.
+    uint64_t WINum = llvm::support::endian::read64le(ReadPtr);
+    Info.setWINum(WINum);
+    ReadPtr += 8;
+  }
+
+  KernelInfo.emplace(std::make_pair(*KernelPtr, std::move(Info)));
+  return true;
+}
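+
+// Little-endian layout of the "<kernel>_kernel_info" variable parsed above:
+//   uint32_t Version;
+//   uint32_t KernelArgsNum;
+//   struct { uint32_t IsLiteral; uint32_t Size; } Args[KernelArgsNum];
+//   uint64_t Attributes1;  // Version >= 2
+//   uint64_t WGNum;        // Version >= 3
+//   uint64_t WINum;        // Version >= 4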
+
+bool OpenCLProgramTy::loadOffloadTable(size_t NumEntries) {
+  const char *OffloadTableSizeVarName = "__omp_offloading_entries_table_size";
+  void *OffloadTableSizeVarAddr =
+      getVarDeviceAddr(OffloadTableSizeVarName, sizeof(int64_t));
+
+  if (!OffloadTableSizeVarAddr) {
+    DP("Warning: cannot get device value for global variable '%s'.\n",
+       OffloadTableSizeVarName);
+    return false;
+  }
+
+  int64_t TableSizeVal = 0;
+  CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL,
+                  DeviceInfo->Queues[DeviceId],
+                  CL_TRUE, &TableSizeVal, OffloadTableSizeVarAddr,
+                  sizeof(int64_t), 0, nullptr, nullptr);
+  size_t TableSize = (size_t)TableSizeVal;
+
+  if ((TableSize % sizeof(DeviceOffloadEntryTy)) != 0) {
+    DP("Warning: offload table size (%zu) is not a multiple of %zu.\n",
+       TableSize, sizeof(DeviceOffloadEntryTy));
+    return false;
+  }
+
+  size_t DeviceNumEntries = TableSize / sizeof(DeviceOffloadEntryTy);
+
+  if (NumEntries != DeviceNumEntries) {
+    DP("Warning: number of entries in host and device "
+       "offload tables mismatch (%zu != %zu).\n", NumEntries, DeviceNumEntries);
+  }
+
+  const char *OffloadTableVarName = "__omp_offloading_entries_table";
+  void *OffloadTableVarAddr = getVarDeviceAddr(OffloadTableVarName, TableSize);
+  if (!OffloadTableVarAddr) {
+    DP("Warning: cannot get device value for global variable '%s'.\n",
+       OffloadTableVarName);
+    return false;
+  }
+
+  OffloadEntries.resize(DeviceNumEntries);
+  CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL,
+                  DeviceInfo->Queues[DeviceId],
+                  CL_TRUE, OffloadEntries.data(), OffloadTableVarAddr,
+                  TableSize, 0, nullptr, nullptr);
+
+  size_t I = 0;
+  const char *PreviousName = "";
+  bool PreviousIsVar = false;
+
+  for (; I < DeviceNumEntries; ++I) {
+    DeviceOffloadEntryTy &Entry = OffloadEntries[I];
+    size_t NameSize = Entry.NameSize;
+    void *NameTgtAddr = Entry.Base.name;
+    Entry.Base.name = nullptr;
+
+    if (NameSize == 0) {
+      DP("Warning: offload entry (%zu) with 0 name size.\n", I);
+      break;
+    }
+    if (NameTgtAddr == nullptr) {
+      DP("Warning: offload entry (%zu) with invalid name.\n", I);
+      break;
+    }
+
+    Entry.Base.name = new char[NameSize];
+    CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL,
+                    DeviceInfo->Queues[DeviceId],
+                    CL_TRUE, Entry.Base.name, NameTgtAddr, NameSize, 0, nullptr,
+                    nullptr);
+    if (strnlen(Entry.Base.name, NameSize) != NameSize - 1) {
+      DP("Warning: offload entry's name has wrong size.\n");
+      break;
+    }
+
+    int Cmp = strncmp(PreviousName, Entry.Base.name, NameSize);
+    if (Cmp > 0) {
+      DP("Warning: offload table is not sorted.\n"
+         "Warning: previous name is '%s'.\n"
+         "Warning:  current name is '%s'.\n",
+         PreviousName, Entry.Base.name);
+      break;
+    } else if (Cmp == 0 && (PreviousIsVar || Entry.Base.size)) {
+      // The names are equal. This should never happen for
+      // offload variables, but we allow this for offload functions.
+      DP("Warning: duplicate names (%s) in offload table.\n", PreviousName);
+      break;
+    }
+    PreviousName = Entry.Base.name;
+    PreviousIsVar = (Entry.Base.size != 0);
+  }
+
+  if (I != DeviceNumEntries) {
+    // An error occurred while processing the table. Deallocate the entry
+    // names allocated in the loop above.
+    for (size_t J = 0; J <= I; ++J) {
+      DeviceOffloadEntryTy &Entry = OffloadEntries[J];
+      if (Entry.Base.name)
+        delete[] Entry.Base.name;
+    }
+
+    OffloadEntries.clear();
+    return false;
+  }
+
+  if (DebugLevel > 0) {
+    DP("Device offload table loaded:\n");
+    for (size_t I = 0; I < DeviceNumEntries; ++I)
+      DP("\t%zu:\t%s\n", I, OffloadEntries[I].Base.name);
+  }
+
+  return true;
+}
+
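+// Resolve a global variable's device address, preferring a binary search
+// over the sorted device offload table and falling back to a lookup by
+// name through the driver extension.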
+void *OpenCLProgramTy::getOffloadVarDeviceAddr(const char *Name, size_t Size) {
+  DP("Looking up OpenMP global variable '%s' of size %zu bytes on device %d.\n",
+     Name, Size, DeviceId);
+
+  if (!OffloadEntries.empty()) {
+    size_t NameSize = strlen(Name) + 1;
+    auto I = std::lower_bound(
+        OffloadEntries.begin(), OffloadEntries.end(), Name,
+        [NameSize](const DeviceOffloadEntryTy &E, const char *Name) {
+          return strncmp(E.Base.name, Name, NameSize) < 0;
+        });
+
+    if (I != OffloadEntries.end() &&
+        strncmp(I->Base.name, Name, NameSize) == 0) {
+      DP("Global variable '%s' found in the offload table at position %zu.\n",
+         Name, std::distance(OffloadEntries.begin(), I));
+      return I->Base.addr;
+    }
+
+    DP("Warning: global variable '%s' was not found in the offload table.\n",
+       Name);
+  } else
+    DP("Warning: offload table is not loaded for device %d.\n", DeviceId);
+
+  // Fallback to the lookup by name.
+  return getVarDeviceAddr(Name, Size);
+}
+
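+// Query the device address of a program-scope variable through the
+// clGetDeviceGlobalVariablePointerINTEL extension. On success, *SizePtr
+// is updated with the device-side size; passing a zero size skips the
+// host/device size consistency check.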
+void *OpenCLProgramTy::getVarDeviceAddr(const char *Name, size_t *SizePtr) {
+  size_t DeviceSize = 0;
+  void *TgtAddr = nullptr;
+  size_t Size = *SizePtr;
+  bool SizeIsKnown = (Size != 0);
+  if (SizeIsKnown) {
+    DP("Looking up device global variable '%s' of size %zu bytes "
+       "on device %d.\n", Name, Size, DeviceId);
+  } else {
+    DP("Looking up device global variable '%s' of unknown size "
+       "on device %d.\n", Name, DeviceId);
+  }
+
+  if (!DeviceInfo->isExtensionFunctionEnabled(
+          DeviceId, clGetDeviceGlobalVariablePointerINTELId))
+    return nullptr;
+
+  cl_int RC;
+  auto clGetDeviceGlobalVariablePointerINTELFn =
+      reinterpret_cast<clGetDeviceGlobalVariablePointerINTEL_fn>(
+          DeviceInfo->getExtensionFunctionPtr(
+              DeviceId, clGetDeviceGlobalVariablePointerINTELId));
+  RC = clGetDeviceGlobalVariablePointerINTELFn(
+      Device, FinalProgram, Name, &DeviceSize, &TgtAddr);
+
+  if (RC != CL_SUCCESS) {
+    DP("Warning: clGetDeviceGlobalVariablePointerINTEL API failed "
+       "for global variable '%s'.\n", Name);
+    DeviceSize = 0;
+  } else if (Size != DeviceSize && SizeIsKnown) {
+    DP("Warning: size mismatch for host (%zu) and device (%zu) versions "
+       "of global variable: %s\n.  Direct references "
+       "to this variable will not work properly.\n",
+       Size, DeviceSize, Name);
+    DeviceSize = 0;
+  }
+
+  if (DeviceSize == 0) {
+    DP("Warning: global variable lookup failed.\n");
+    return nullptr;
+  }
+
+  DP("Global variable lookup succeeded (size: %zu bytes).\n", DeviceSize);
+  *SizePtr = DeviceSize;
+  return TgtAddr;
+}
+
+void *OpenCLProgramTy::getVarDeviceAddr(const char *Name, size_t Size) {
+  return getVarDeviceAddr(Name, &Size);
+}
+
+const KernelInfoTy *OpenCLProgramTy::getKernelInfo(
+    const cl_kernel Kernel) const {
+  auto I = KernelInfo.find(Kernel);
+  if (I != KernelInfo.end())
+    return &I->second;
+  return nullptr;
+}
+
+///
+/// Common plugin interface
+///
+
+int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
+  uint64_t MajorVer, MinorVer;
+  if (isValidOneOmpImage(Image, MajorVer, MinorVer)) {
+    DP("Target binary is a valid oneAPI OpenMP image.\n");
+    return 1;
+  }
+
+  DP("Target binary is *not* a valid oneAPI OpenMP image.\n");
+
+  // Fallback to legacy behavior, when the image is a plain
+  // SPIR-V file.
+  uint32_t MagicWord = *(uint32_t *)Image->ImageStart;
+  // Compare the magic word in both little-endian and big-endian order:
+  int32_t Ret = (MagicWord == 0x07230203 || MagicWord == 0x03022307);
+  DP("Target binary is %s\n", Ret ? "VALID" : "INVALID");
+
+  return Ret;
+}
+
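+// Enumerate all OpenCL platforms, collect the devices of the requested
+// type from platforms that support OpenCL 2.1+, and size the per-device
+// bookkeeping structures accordingly.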
+int32_t __tgt_rtl_number_of_devices() {
+  // Assumed to be thread-safe: libomptarget calls this function only once.
+
+  DP("Start initializing OpenCL\n");
+  // get available platforms
+  cl_uint Count = 0;
+  CALL_CL_RET_ZERO(clGetPlatformIDs, 0, nullptr, &Count);
+  std::vector<cl_platform_id> PlatformIDs(Count);
+  CALL_CL_RET_ZERO(clGetPlatformIDs, Count, PlatformIDs.data(), nullptr);
+
+  // All eligible OpenCL device IDs from the platforms are stored in a list
+  // in the order they are probed by clGetPlatformIDs/clGetDeviceIDs.
+  for (cl_platform_id ID : PlatformIDs) {
+    std::vector<char> Buf;
+    size_t BufSize;
+    cl_int RC;
+    CALL_CL(RC, clGetPlatformInfo, ID, CL_PLATFORM_VERSION, 0, nullptr,
+            &BufSize);
+    if (RC != CL_SUCCESS || BufSize == 0)
+      continue;
+    Buf.resize(BufSize);
+    CALL_CL(RC, clGetPlatformInfo, ID, CL_PLATFORM_VERSION, BufSize,
+            Buf.data(), nullptr);
+    // The version string has the form "OpenCL <major>.<minor> ...", and
+    // clCreateProgramWithIL() requires OpenCL 2.1 or later.
+    if (RC != CL_SUCCESS || std::stof(std::string(Buf.data() + 6)) <= 2.0) {
+      continue;
+    }
+    cl_uint NumDevices = 0;
+    CALL_CL_SILENT(RC, clGetDeviceIDs, ID, DeviceInfo->Option.DeviceType, 0,
+                   nullptr, &NumDevices);
+    if (RC != CL_SUCCESS || NumDevices == 0)
+      continue;
+
+    DP("Platform %s has %" PRIu32 " Devices\n",
+       Buf.data() ? Buf.data() : "undefined", NumDevices);
+    std::vector<cl_device_id> Devices(NumDevices);
+    CALL_CL_RET_ZERO(clGetDeviceIDs, ID, DeviceInfo->Option.DeviceType,
+                     NumDevices, Devices.data(), nullptr);
+
+    cl_context Context = nullptr;
+    if (DeviceInfo->Option.Flags.UseSingleContext) {
+      cl_context_properties ContextProperties[] = {
+          CL_CONTEXT_PLATFORM, (cl_context_properties)ID, 0
+      };
+      CALL_CL_RVRC(Context, clCreateContext, RC, ContextProperties,
+                   Devices.size(), Devices.data(), nullptr, nullptr);
+      if (RC != CL_SUCCESS)
+        continue;
+    }
+
+    DeviceInfo->PlatformInfos.emplace(ID, PlatformInfoTy(ID, Context));
+    for (auto Device : Devices) {
+      DeviceInfo->Devices.push_back(Device);
+      DeviceInfo->Platforms.push_back(ID);
+    }
+    DeviceInfo->NumDevices += NumDevices;
+  }
+
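+  // Reserve the per-device state. All containers below are indexed by the
+  // libomptarget device ID assigned in the enumeration order above.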
+  if (!DeviceInfo->Option.Flags.UseSingleContext)
+    DeviceInfo->Contexts.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Programs.resize(DeviceInfo->NumDevices);
+  DeviceInfo->maxExecutionUnits.resize(DeviceInfo->NumDevices);
+  DeviceInfo->maxWorkGroupSize.resize(DeviceInfo->NumDevices);
+  DeviceInfo->MaxMemAllocSize.resize(DeviceInfo->NumDevices);
+  DeviceInfo->DeviceProperties.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Extensions.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Queues.resize(DeviceInfo->NumDevices);
+  DeviceInfo->QueuesInOrder.resize(DeviceInfo->NumDevices, nullptr);
+  DeviceInfo->KernelProperties.resize(DeviceInfo->NumDevices);
+  DeviceInfo->ClMemBuffers.resize(DeviceInfo->NumDevices);
+  DeviceInfo->ImplicitArgs.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Profiles.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Names.resize(DeviceInfo->NumDevices);
+  DeviceInfo->DeviceArchs.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Initialized.resize(DeviceInfo->NumDevices);
+  DeviceInfo->SLMSize.resize(DeviceInfo->NumDevices);
+  DeviceInfo->Mutexes = new std::mutex[DeviceInfo->NumDevices];
+  DeviceInfo->ProfileLocks = new std::mutex[DeviceInfo->NumDevices];
+  DeviceInfo->OwnedMemory.resize(DeviceInfo->NumDevices);
+  DeviceInfo->NumActiveKernels.resize(DeviceInfo->NumDevices, 0);
+
+  // Host allocation information needs one additional slot
+  for (uint32_t I = 0; I < DeviceInfo->NumDevices + 1; I++)
+    DeviceInfo->MemAllocInfo.emplace_back(new MemAllocInfoMapTy());
+
+  // get device specific information
+  for (unsigned I = 0; I < DeviceInfo->NumDevices; I++) {
+    size_t BufSize;
+    cl_int RC;
+    cl_device_id DeviceId = DeviceInfo->Devices[I];
+    CALL_CL(RC, clGetDeviceInfo, DeviceId, CL_DEVICE_NAME, 0, nullptr,
+            &BufSize);
+    if (RC != CL_SUCCESS || BufSize == 0)
+      continue;
+    DeviceInfo->Names[I].resize(BufSize);
+    CALL_CL(RC, clGetDeviceInfo, DeviceId, CL_DEVICE_NAME, BufSize,
+            DeviceInfo->Names[I].data(), nullptr);
+    if (RC != CL_SUCCESS)
+      continue;
+    DP("Device %d: %s\n", I, DeviceInfo->Names[I].data());
+    CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_MAX_COMPUTE_UNITS,
+                     sizeof(cl_uint), &DeviceInfo->maxExecutionUnits[I],
+                     nullptr);
+    DP("Number of execution units on the device is %d\n",
+       DeviceInfo->maxExecutionUnits[I]);
+    CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                     sizeof(size_t), &DeviceInfo->maxWorkGroupSize[I], nullptr);
+    DP("Maximum work group size for the device is %d\n",
+       static_cast<int32_t>(DeviceInfo->maxWorkGroupSize[I]));
+    CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                     sizeof(cl_ulong), &DeviceInfo->MaxMemAllocSize[I],
+                     nullptr);
+    DP("Maximum memory allocation size is %" PRIu64 "\n",
+       DeviceInfo->MaxMemAllocSize[I]);
+    CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_LOCAL_MEM_SIZE,
+                     sizeof(cl_ulong), &DeviceInfo->SLMSize[I], nullptr);
+    DP("Device local mem size: %zu\n", (size_t)DeviceInfo->SLMSize[I]);
+    DeviceInfo->Initialized[I] = false;
+  }
+  if (DeviceInfo->NumDevices == 0) {
+    DP("WARNING: No OpenCL devices found.\n");
+  }
+
+#ifndef _WIN32
+  // std::atexit handlers run in reverse order of registration, so register
+  // the clean-up routine after the OpenCL handlers to ensure it runs before
+  // them. On Windows, registration is done in DllMain instead.
+  if (std::atexit(closeRTL)) {
+    FATAL_ERROR("Registration of clean-up function");
+  }
+#endif // !_WIN32
+
+  return DeviceInfo->NumDevices;
+}
+
+int32_t __tgt_rtl_init_device(int32_t DeviceId) {
+  cl_int RC;
+  DP("Initialize OpenCL device\n");
+  assert(DeviceId >= 0 && (cl_uint)DeviceId < DeviceInfo->NumDevices &&
+         "bad device id");
+
+  // Use out-of-order queue by default.
+  std::vector<cl_queue_properties> QProperties {
+      CL_QUEUE_PROPERTIES,
+      CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
+  };
+  if (DeviceInfo->Option.Flags.EnableProfile)
+    QProperties.back() |= CL_QUEUE_PROFILING_ENABLE;
+  QProperties.push_back(0);
+
+  if (!DeviceInfo->Option.Flags.UseSingleContext) {
+    auto Platform = DeviceInfo->Platforms[DeviceId];
+    auto Device = DeviceInfo->Devices[DeviceId];
+    cl_context_properties ContextProperties[] = {
+        CL_CONTEXT_PLATFORM, (cl_context_properties)Platform, 0
+    };
+    CALL_CL_RVRC(DeviceInfo->Contexts[DeviceId], clCreateContext, RC,
+                 ContextProperties, 1, &Device, nullptr, nullptr);
+    if (RC != CL_SUCCESS)
+      return OFFLOAD_FAIL;
+  }
+
+  auto CLDeviceId = DeviceInfo->Devices[DeviceId];
+  auto Context = DeviceInfo->getContext(DeviceId);
+  CALL_CL_RVRC(DeviceInfo->Queues[DeviceId], clCreateCommandQueueWithProperties,
+               RC, Context, CLDeviceId, QProperties.data());
+  if (RC != CL_SUCCESS) {
+    DP("Error: Failed to create CommandQueue: %d\n", RC);
+    return OFFLOAD_FAIL;
+  }
+
+  auto &Extension = DeviceInfo->Extensions[DeviceId];
+  Extension.getExtensionsInfoForDevice(DeviceId);
+
+  if (Extension.DeviceAttributeQuery == ExtensionStatusEnabled) {
+    if (OFFLOAD_SUCCESS !=
+        DeviceInfo->DeviceProperties[DeviceId].getDeviceProperties(CLDeviceId))
+      return OFFLOAD_FAIL;
+  }
+
+  DeviceInfo->DeviceArchs[DeviceId] = DeviceInfo->getDeviceArch(DeviceId);
+
+  DeviceInfo->Initialized[DeviceId] = true;
+
+  return OFFLOAD_SUCCESS;
+}
+
+int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
+  DP("Initialize requires flags to %" PRId64 "\n", RequiresFlags);
+  DeviceInfo->RequiresFlags = RequiresFlags;
+  return RequiresFlags;
+}
+
+__tgt_target_table *__tgt_rtl_load_binary(
+    int32_t DeviceId, __tgt_device_image *Image) {
+  DP("Device %" PRId32 ": Loading binary from " DPxMOD "\n", DeviceId,
+     DPxPTR(Image->ImageStart));
+
+  size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart;
+  size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin);
+  (void)NumEntries;
+
+  DP("Expecting to have %zu entries defined\n", NumEntries);
+
+  auto &Option = DeviceInfo->Option;
+  std::string CompilationOptions(Option.CompilationOptions + " " +
+                                 Option.UserCompilationOptions);
+  std::string LinkingOptions(Option.UserLinkingOptions);
+
+  DP("Base OpenCL compilation options: %s\n", CompilationOptions.c_str());
+  DP("Base OpenCL linking options: %s\n", LinkingOptions.c_str());
+
+  dumpImageToFile(Image->ImageStart, ImageSize, "OpenMP");
+
+  auto Context = DeviceInfo->getContext(DeviceId);
+  auto Device = DeviceInfo->Devices[DeviceId];
+  DeviceInfo->Programs[DeviceId].emplace_back(Image, Context, Device, DeviceId);
+  auto &Program = DeviceInfo->Programs[DeviceId].back();
+
+  ProfileIntervalTy CompilationTimer("Compiling", DeviceId);
+  ProfileIntervalTy LinkingTimer("Linking", DeviceId);
+
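+  // Build pipeline: create the program objects, compile and link them into
+  // the final executable program, then build a kernel for each offload
+  // entry.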
+  CompilationTimer.start();
+  int32_t RC = Program.buildPrograms(CompilationOptions, LinkingOptions);
+  if (RC != OFFLOAD_SUCCESS)
+    return nullptr;
+
+  // Decide final compilation/linking options
+  DP("Final OpenCL compilation options: %s\n", CompilationOptions.c_str());
+  DP("Final OpenCL linking options: %s\n", LinkingOptions.c_str());
+  // clLinkProgram drops the last character of the options string. Append a
+  // trailing space as a temporary workaround.
+  LinkingOptions += " ";
+
+  RC = Program.compilePrograms(CompilationOptions, LinkingOptions);
+  CompilationTimer.stop();
+  if (RC != OFFLOAD_SUCCESS)
+    return nullptr;
+
+  LinkingTimer.start();
+  RC = Program.linkPrograms(LinkingOptions);
+  LinkingTimer.stop();
+  if (RC != OFFLOAD_SUCCESS)
+    return nullptr;
+
+  RC = Program.buildKernels();
+  if (RC != OFFLOAD_SUCCESS)
+    return nullptr;
+
+  auto *Table = Program.getTablePtr();
+
+  return Table;
+}
+
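+// Allocate device-accessible data. Allocations without an associated host
+// pointer are user allocations: they honor an explicitly requested
+// allocation kind, or use the experimental cl_mem buffer path when it is
+// enabled. Everything else goes through the default allocation path.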
+void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr,
+                           int32_t Kind) {
+  bool ImplicitArg = false;
+
+  if (!HstPtr) {
+    // No host pointer: this is a user allocation (e.g., omp_target_alloc).
+    ImplicitArg = true;
+    if (Kind != TARGET_ALLOC_DEFAULT) {
+      // Honor the explicitly requested allocation kind.
+      return dataAllocExplicit(DeviceId, Size, Kind);
+    }
+    if (DeviceInfo->Option.Flags.UseBuffer) {
+      // Experimental CL buffer allocation
+      return DeviceInfo->allocDataClMem(DeviceId, Size);
+    }
+  }
+  return dataAlloc(DeviceId, Size, HstPtr, HstPtr, ImplicitArg);
+}
+
+int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
+                              int64_t Size) {
+  return submitData(DeviceId, TgtPtr, HstPtr, Size);
+}
+
+int32_t __tgt_rtl_data_submit_async(
+    int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
+    __tgt_async_info *AsyncInfo /*not used*/) {
+  return submitData(DeviceId, TgtPtr, HstPtr, Size);
+}
+
+int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
+                                int64_t Size) {
+  return retrieveData(DeviceId, HstPtr, TgtPtr, Size);
+}
+
+int32_t __tgt_rtl_data_retrieve_async(
+    int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
+    __tgt_async_info *AsyncInfo /*not used*/) {
+  return retrieveData(DeviceId, HstPtr, TgtPtr, Size);
+}
+
+int32_t __tgt_rtl_is_data_exchangable(int32_t SrcId, int32_t DstId) {
+  // Only same-device exchange is supported. There is no documented OpenCL
+  // behavior for cross-device data transfer.
+  if (SrcId == DstId)
+    return 1;
+
+  return 0;
+}
+
+int32_t __tgt_rtl_data_exchange(int32_t SrcId, void *SrcPtr, int32_t DstId,
+                                void *DstPtr, int64_t Size) {
+  if (SrcId != DstId)
+    return OFFLOAD_FAIL;
+
+  // This is OK for same-device copy with SVM or USM extension.
+  return __tgt_rtl_data_submit(DstId, DstPtr, SrcPtr, Size);
+}
+
+int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
+  {
+    // Hold the lock in a scope so that every exit path, including the
+    // early returns inside CALL_CL_RET_FAIL, releases it.
+    std::lock_guard<std::mutex> Lock(DeviceInfo->Mutexes[DeviceId]);
+
+    // Deallocate cl_mem data
+    if (DeviceInfo->Option.Flags.UseBuffer) {
+      auto &ClMemBuffers = DeviceInfo->ClMemBuffers[DeviceId];
+      if (ClMemBuffers.count(TgtPtr) > 0) {
+        ClMemBuffers.erase(TgtPtr);
+        CALL_CL_RET_FAIL(clReleaseMemObject, (cl_mem)TgtPtr);
+        return OFFLOAD_SUCCESS;
+      }
+    }
+  }
+
+  MemAllocInfoTy Info;
+  auto &AllocInfos = DeviceInfo->MemAllocInfo;
+  auto Removed = AllocInfos[DeviceId]->remove(TgtPtr, &Info);
+  // Try again with device-independent allocation information (host USM)
+  if (!Removed && DeviceInfo->Option.Flags.UseSingleContext)
+    Removed = AllocInfos[DeviceInfo->NumDevices]->remove(TgtPtr, &Info);
+  if (!Removed) {
+    DP("Error: Cannot find memory allocation information for " DPxMOD "\n",
+       DPxPTR(TgtPtr));
+    return OFFLOAD_FAIL;
+  }
+
+  auto Context = DeviceInfo->getContext(DeviceId);
+  if (DeviceInfo->Option.Flags.UseSVM) {
+    CALL_CL_VOID(clSVMFree, Context, Info.Base);
+  } else {
+    CALL_CL_EXT_VOID(DeviceId, clMemFreeINTEL, Context, Info.Base);
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_run_target_team_region(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
+    int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit,
+    uint64_t LoopTripCount /*not used*/) {
+  return runTargetTeamNDRegion(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
+                               NumArgs, NumTeams, ThreadLimit, nullptr);
+}
+
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
+    int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit,
+    uint64_t LoopTripCount /*not used*/,
+    __tgt_async_info *AsyncInfo /*not used*/) {
+  return runTargetTeamNDRegion(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
+                               NumArgs, NumTeams, ThreadLimit, nullptr);
+}
+
+int32_t __tgt_rtl_run_target_region(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
+    int32_t NumArgs) {
+  // Use a single team.
+  return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
+                                          TgtOffsets, NumArgs, 1, 0, 0);
+}
+
+int32_t __tgt_rtl_run_target_region_async(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
+    int32_t NumArgs, __tgt_async_info *AsyncInfo /*not used*/) {
+  // Use a single team.
+  return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
+                                          TgtOffsets, NumArgs, 1, 0, 0);
+}
+
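+// The *_async entry points in this plugin fall back to their blocking
+// counterparts, so there is nothing left to wait for here.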
+int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
+  return OFFLOAD_SUCCESS;
+}
+
+///
+/// Extended plugin interface
+///
+
+// Notify the kernel about target pointers that are not explicitly
+// passed as arguments, but which are pointing to mapped objects
+// that may potentially be accessed in the kernel code (e.g. PTR_AND_OBJ
+// objects).
+int32_t __tgt_rtl_manifest_data_for_region(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtPtrs, size_t NumPtrs) {
+  cl_kernel Kernel = *static_cast<cl_kernel *>(TgtEntryPtr);
+  DP("Stashing %zu implicit arguments for kernel " DPxMOD "\n", NumPtrs,
+     DPxPTR(Kernel));
+  auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel];
+  std::lock_guard<std::mutex> Lock(DeviceInfo->Mutexes[DeviceId]);
+  KernelProperty.ImplicitArgs.clear();
+  KernelProperty.ImplicitArgs.insert(TgtPtrs, TgtPtrs + NumPtrs);
+
+  return OFFLOAD_SUCCESS;
+}
+
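+// A pointer requires explicit mapping if the driver does not recognize it
+// as a USM allocation, or if it is host USM memory with a positive size.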
+int32_t __tgt_rtl_requires_mapping(int32_t DeviceId, void *Ptr, int64_t Size) {
+  // Force mapping for host memory with positive size
+  int32_t Ret;
+  cl_unified_shared_memory_type_intel MemType = 0;
+  CALL_CL_EXT_RET(DeviceId, false, clGetMemAllocInfoINTEL,
+                  DeviceInfo->getContext(DeviceId), Ptr,
+                  CL_MEM_ALLOC_TYPE_INTEL, sizeof(MemType), &MemType,
+                  nullptr);
+  if (MemType == CL_MEM_TYPE_UNKNOWN_INTEL ||
+      (MemType == CL_MEM_TYPE_HOST_INTEL && Size > 0))
+    Ret = 1;
+  else
+    Ret = 0;
+
+  DP("Ptr " DPxMOD " %s mapping.\n", DPxPTR(Ptr),
+     Ret ? "requires" : "does not require");
+  return Ret;
+}
+
+// Allocate a base buffer with the given information.
+void *__tgt_rtl_data_alloc_base(int32_t DeviceId, int64_t Size, void *HstPtr,
+                                void *HstBase) {
+  return dataAlloc(DeviceId, Size, HstPtr, HstBase, false);
+}
+
+// Allocate a managed memory object.
+void *__tgt_rtl_data_alloc_managed(int32_t DeviceId, int64_t Size) {
+  int32_t Kind = DeviceInfo->Option.Flags.UseHostMemForUSM
+                    ? TARGET_ALLOC_HOST : TARGET_ALLOC_SHARED;
+  return dataAllocExplicit(DeviceId, Size, Kind);
+}
+
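+// Reallocate a device allocation. The old pointer is returned unchanged
+// when the new size and kind still fit. Otherwise a new allocation is
+// made, the old contents are copied over (through the device queue for
+// device memory, with a plain host copy otherwise), and the old
+// allocation is deleted.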
+void *__tgt_rtl_data_realloc(
+    int32_t DeviceId, void *Ptr, size_t Size, int32_t Kind) {
+  const MemAllocInfoTy *Info = nullptr;
+
+  if (Ptr) {
+    Info = DeviceInfo->MemAllocInfo[DeviceId]->find(Ptr);
+    if (!Info && DeviceInfo->Option.Flags.UseSingleContext)
+      Info = DeviceInfo->MemAllocInfo[DeviceInfo->NumDevices]->find(Ptr);
+    if (!Info) {
+      DP("Error: Cannot find allocation information for pointer " DPxMOD "\n",
+         DPxPTR(Ptr));
+      return nullptr;
+    }
+    if (Size <= Info->Size && Kind == Info->Kind) {
+      DP("Returning the same pointer " DPxMOD " as reallocation is unneeded\n",
+         DPxPTR(Ptr));
+      return Ptr;
+    }
+  }
+
+  int32_t AllocKind =
+      (Kind == TARGET_ALLOC_DEFAULT) ? TARGET_ALLOC_DEVICE : Kind;
+
+  void *Mem = dataAllocExplicit(DeviceId, Size, AllocKind);
+
+  if (Mem && Info) {
+    if (AllocKind == TARGET_ALLOC_DEVICE || Info->Kind == TARGET_ALLOC_DEVICE ||
+        Info->Kind == TARGET_ALLOC_SVM) {
+      // TARGET_ALLOC_SVM is for "Device" memory type when SVM is enabled
+      auto Queue = DeviceInfo->Queues[DeviceId];
+      if (DeviceInfo->Option.Flags.UseSVM) {
+        CALL_CL_RET_NULL(clEnqueueSVMMemcpy, Queue, CL_TRUE, Mem, Ptr,
+                         Info->Size, 0, nullptr, nullptr);
+      } else {
+        CALL_CL_EXT_RET_NULL(DeviceId, clEnqueueMemcpyINTEL, Queue, CL_TRUE,
+                             Mem, Ptr, Info->Size, 0, nullptr, nullptr);
+      }
+    } else {
+      std::copy_n((char *)Ptr, Info->Size, (char *)Mem);
+    }
+    auto Rc = __tgt_rtl_data_delete(DeviceId, Ptr);
+    if (Rc != OFFLOAD_SUCCESS)
+      return nullptr;
+  }
+
+  return Mem;
+}
+
+void *__tgt_rtl_get_context_handle(int32_t DeviceId) {
+  return (void *)DeviceInfo->getContext(DeviceId);
+}
+
+void *__tgt_rtl_data_aligned_alloc(int32_t DeviceId, size_t Align, size_t Size,
+                                   int32_t Kind) {
+  if (Align != 0 && (Align & (Align - 1)) != 0) {
+    DP("Error: Alignment %zu is not a power of two.\n", Align);
+    return nullptr;
+  }
+
+  int32_t AllocKind =
+      (Kind == TARGET_ALLOC_DEFAULT) ? TARGET_ALLOC_DEVICE : Kind;
+
+  return dataAllocExplicit(DeviceId, Size, AllocKind, Align);
+}
+
+int32_t __tgt_rtl_run_target_team_nd_region(
+    int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
+    int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, void *LoopDesc) {
+  return runTargetTeamNDRegion(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
+                               NumArgs, NumTeams, ThreadLimit, LoopDesc);
+}
+
+char *__tgt_rtl_get_device_name(int32_t DeviceId, char *Buf, size_t BufMax) {
+  assert(Buf && "Buf cannot be nullptr.");
+  assert(BufMax > 0 && "BufMax cannot be zero.");
+  CALL_CL_RET_NULL(clGetDeviceInfo, DeviceInfo->Devices[DeviceId],
+                   CL_DEVICE_NAME, BufMax, Buf, nullptr);
+  return Buf;
+}
+
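+// Record user compilation/linking options unless the corresponding
+// LIBOMPTARGET_OPENCL_* environment variable already provided them, in
+// which case the environment settings take precedence.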
+void __tgt_rtl_add_build_options(
+    const char *CompileOptions, const char *LinkOptions) {
+  if (CompileOptions) {
+    auto &compileOptions = DeviceInfo->Option.UserCompilationOptions;
+    if (compileOptions.empty()) {
+      compileOptions = std::string(CompileOptions) + " ";
+    } else {
+      DP("Respecting LIBOMPTARGET_OPENCL_COMPILATION_OPTIONS=%s\n",
+         compileOptions.c_str());
+    }
+  }
+  if (LinkOptions) {
+    auto &linkOptions = DeviceInfo->Option.UserLinkingOptions;
+    if (linkOptions.empty()) {
+      linkOptions = std::string(LinkOptions) + " ";
+    } else {
+      DP("Respecting LIBOMPTARGET_OPENCL_LINKING_OPTIONS=%s\n",
+         linkOptions.c_str());
+    }
+  }
+}
+
+int32_t __tgt_rtl_is_supported_device(int32_t DeviceId, void *DeviceType) {
+  if (!DeviceType)
+    return 1;
+
+  uint64_t DeviceArch = DeviceInfo->DeviceArchs[DeviceId];
+  int32_t Ret = (DeviceArch & (uint64_t)DeviceType) == DeviceArch;
+  DP("Device %" PRId32 " does%s match the requested device types " DPxMOD "\n",
+     DeviceId, Ret ? "" : " not", DPxPTR(DeviceType));
+  return Ret;
+}
+
+void __tgt_rtl_deinit(void) {
+  // No-op on Linux
+#ifdef _WIN32
+  if (DeviceInfo) {
+    closeRTL();
+    deinit();
+  }
+#endif // _WIN32
+}
+
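+// A range is accessible from the host only if it lies entirely within a
+// host or shared USM allocation made by this plugin.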
+int32_t __tgt_rtl_is_accessible_addr_range(
+    int32_t DeviceId, const void *Ptr, size_t Size) {
+  if (!Ptr || Size == 0)
+    return 0;
+
+  auto MemType = DeviceInfo->getMemAllocType(DeviceId, Ptr);
+  if (MemType != CL_MEM_TYPE_HOST_INTEL && MemType != CL_MEM_TYPE_SHARED_INTEL)
+    return 0;
+
+  if (MemType == CL_MEM_TYPE_HOST_INTEL &&
+      DeviceInfo->Option.Flags.UseSingleContext)
+    DeviceId = DeviceInfo->NumDevices;
+
+  return DeviceInfo->MemAllocInfo[DeviceId]->contains(Ptr, Size) ? 1 : 0;
+}
+
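+// An argument is private on the host when the kernel expects it by value,
+// i.e. it is marked as a "literal" argument in the kernel info.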
+int32_t __tgt_rtl_is_private_arg_on_host(
+    int32_t DeviceId, const void *TgtEntryPtr, uint32_t Idx) {
+  const cl_kernel *Kernel = static_cast<const cl_kernel *>(TgtEntryPtr);
+  if (!*Kernel) {
+    REPORT("Querying information about a deleted kernel.\n");
+    return 0;
+  }
+  auto *KernelInfo = DeviceInfo->getKernelInfo(DeviceId, *Kernel);
+  if (!KernelInfo)
+    return 0;
+
+  if (KernelInfo->isArgLiteral(Idx))
+    return 1;
+
+  return 0;
+}
+
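+// Allocate scratch space of ObjSize bytes for every hardware thread on
+// the device (ObjSize * NumHWThreads bytes in total) using the requested
+// USM allocation kind.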
+void *__tgt_rtl_alloc_per_hw_thread_scratch(
+    int32_t DeviceId, size_t ObjSize, int32_t AllocKind) {
+  void *Mem = nullptr;
+  cl_uint NumHWThreads = DeviceInfo->DeviceProperties[DeviceId].NumHWThreads;
+
+  if (NumHWThreads == 0)
+    return Mem;
+
+  // Only support USM
+  cl_int RC;
+  auto Context = DeviceInfo->getContext(DeviceId);
+  auto Device = DeviceInfo->Devices[DeviceId];
+  size_t AllocSize = ObjSize * NumHWThreads;
+  auto AllocProp = DeviceInfo->getAllocMemProperties(DeviceId, AllocSize);
+
+  switch (AllocKind) {
+  case TARGET_ALLOC_HOST:
+    CALL_CL_EXT_RVRC(DeviceId, Mem, clHostMemAllocINTEL, RC, Context,
+                     AllocProp->data(), AllocSize, 0 /* Align */);
+    break;
+  case TARGET_ALLOC_SHARED:
+    CALL_CL_EXT_RVRC(DeviceId, Mem, clSharedMemAllocINTEL, RC, Context, Device,
+                     AllocProp->data(), AllocSize, 0 /* Align */);
+    break;
+  case TARGET_ALLOC_DEVICE:
+  default:
+    CALL_CL_EXT_RVRC(DeviceId, Mem, clDeviceMemAllocINTEL, RC, Context, Device,
+                     AllocProp->data(), AllocSize, 0 /* Align */);
+  }
+
+  if (RC != CL_SUCCESS) {
+    DP("Failed to allocate per-hw-thread scratch space.\n");
+    return nullptr;
+  }
+
+  DP("Allocated %zu byte per-hw-thread scratch space at " DPxMOD "\n",
+     AllocSize, DPxPTR(Mem));
+
+  return Mem;
+}
+
+void __tgt_rtl_free_per_hw_thread_scratch(int32_t DeviceId, void *Ptr) {
+  auto Context = DeviceInfo->getContext(DeviceId);
+  CALL_CL_EXT_VOID(DeviceId, clMemFreeINTEL, Context, Ptr);
+}