diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
--- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -281,3 +281,61 @@
 endif()
 set(OPENMP_PTHREAD_LIB ${LLVM_PTHREAD_LIB})
+
+################################################################################
+# Looking for OpenCL...
+################################################################################
+
+# CMake 3.4.3 cannot find the OpenCL library unless an OpenCL SDK is
+# installed as root. This workaround is a fallback that lets us find the
+# library as well as the includes for now.
+message(STATUS "Looking for OpenCL includes.")
+
+find_path(LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS
+  NAMES
+  CL/cl.h OpenCL/cl.h
+  PATHS
+  ENV LIBOMPTARGET_OCL_ROOT
+  ENV CPATH
+  PATH_SUFFIXES
+  include include/sycl)
+message(STATUS "OpenCL include DIR: ${LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS}")
+
+if (NOT LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS)
+  set(LIBOMPTARGET_DEP_OPENCL_FOUND FALSE)
+  message(STATUS "Could NOT find OpenCL. Missing includes.")
+else()
+
+  message(STATUS "Looking for OpenCL library.")
+
+  find_library(LIBOMPTARGET_DEP_OPENCL_LIBRARIES
+    NAMES OpenCL
+    PATHS
+    ENV LIBOMPTARGET_OCL_ROOT
+    ENV LIBRARY_PATH
+    ENV LD_LIBRARY_PATH
+    PATH_SUFFIXES
+    lib lib64 lib/intel64_lin)
+  message(STATUS "OpenCL lib: ${LIBOMPTARGET_DEP_OPENCL_LIBRARIES}")
+
+  if (NOT LIBOMPTARGET_DEP_OPENCL_LIBRARIES)
+    set(LIBOMPTARGET_DEP_OPENCL_FOUND FALSE)
+    message(STATUS "Could NOT find OpenCL. Missing libs.")
+  else()
+    set(LIBOMPTARGET_DEP_OPENCL_FOUND TRUE)
+  endif()
+
+endif()
+
+if (NOT LIBOMPTARGET_DEP_OPENCL_FOUND)
+  message(STATUS "Looking for OpenCL again.")
+  find_package(OpenCL)
+  set(LIBOMPTARGET_DEP_OPENCL_FOUND ${OpenCL_FOUND})
+  set(LIBOMPTARGET_DEP_OPENCL_LIBRARIES ${OpenCL_LIBRARIES})
+  set(LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+endif()
+
+mark_as_advanced(
+  LIBOMPTARGET_DEP_OPENCL_FOUND
+  LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS
+  LIBOMPTARGET_DEP_OPENCL_LIBRARIES)
diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt
--- a/openmp/libomptarget/plugins/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/CMakeLists.txt
@@ -81,6 +81,7 @@
 add_subdirectory(ve)
 add_subdirectory(x86_64)
 add_subdirectory(remote)
+add_subdirectory(opencl)
 
 # Make sure the parent scope can see the plugins that will be created.
 set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt b/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
--- a/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/common/elf_common/CMakeLists.txt
@@ -10,7 +10,7 @@
 #
 ##===----------------------------------------------------------------------===##
 
-add_library(elf_common OBJECT elf_common.cpp)
+add_library(elf_common OBJECT elf_common.cpp elf_light.cpp)
 
 # Build elf_common with PIC to be able to link it with plugin shared libraries.
 set_property(TARGET elf_common PROPERTY POSITION_INDEPENDENT_CODE ON)
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_light.h b/openmp/libomptarget/plugins/common/elf_common/elf_light.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/common/elf_common/elf_light.h
@@ -0,0 +1,181 @@
+//===-- elf_light.h - Basic ELF functionality -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The light ELF implementation provides basic ELF reading functionality.
+// It may be used on systems without libelf support, if the corresponding
+// LLVM ELF implementation is available.
+// The interface declared here must be independent of libelf.h/elf.h.
+//
+// NOTE: we can try to rely on https://github.com/WolfgangSt/libelf
+// on Windows, if this implementation gets more complex.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_LIGHT_H
+#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_LIGHT_H
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+
+class ElfL;
+class ElfLSegmentNoteIterator;
+class ElfLSectionNoteIterator;
+class ElfLNote;
+
+// Class representing NOTEs from PT_NOTE segments and SHT_NOTE sections.
+class ElfLNote {
+  const void *Impl = nullptr;
+
+  friend class ElfLSegmentNoteIterator;
+  friend class ElfLSectionNoteIterator;
+
+  // Only ElfLSectionNoteIterator is allowed to create notes via its
+  // operator*().
+  explicit ElfLNote(const void *I);
+  ElfLNote &operator=(const ElfLNote &) = delete;
+
+public:
+  // FIXME: add move copy constructor and assignment operator.
+  ElfLNote(const ElfLNote &);
+  ~ElfLNote();
+  // Returns the note's name size not including the null terminator.
+  // Note that it may be illegal to access the getName() pointer
+  // beyond the returned size, i.e. the implementation may
+  // not guarantee that there is '\0' after getNameSize()
+  // characters of the name.
+  uint64_t getNameSize() const;
+  // Returns a pointer to the beginning of the note's name.
+  const char *getName() const;
+  // Returns the number of bytes in the descriptor.
+  uint64_t getDescSize() const;
+  // Returns a pointer to the beginning of the note's descriptor.
+  // It is illegal to access more than getDescSize() bytes
+  // via this pointer.
+  const uint8_t *getDesc() const;
+  uint64_t getType() const;
+};
+
+// Iterator over NOTEs in PT_NOTE segments.
+class ElfLSegmentNoteIterator
+    : std::iterator<std::input_iterator_tag, ElfLNote> {
+
+  void *Impl = nullptr;
+
+  friend class ElfL;
+
+  // Only ElfL is allowed to create iterators to itself.
+  ElfLSegmentNoteIterator(const void *I, bool IsEnd = false);
+  ElfLSegmentNoteIterator &operator=(const ElfLSegmentNoteIterator &) = delete;
+
+public:
+  // FIXME: add move copy constructor and assignment operator.
+  ElfLSegmentNoteIterator(const ElfLSegmentNoteIterator &Other);
+  ~ElfLSegmentNoteIterator();
+  ElfLSegmentNoteIterator &operator++();
+  bool operator==(const ElfLSegmentNoteIterator Other) const;
+  bool operator!=(const ElfLSegmentNoteIterator Other) const;
+  ElfLNote operator*() const;
+};
+
+// Iterator over NOTEs in SHT_NOTE sections.
+class ElfLSectionNoteIterator
+    : std::iterator<std::input_iterator_tag, ElfLNote> {
+
+  void *Impl = nullptr;
+
+  friend class ElfL;
+
+  // Only ElfL is allowed to create iterators to itself.
+  ElfLSectionNoteIterator(const void *I, bool IsEnd = false);
+  ElfLSectionNoteIterator &operator=(const ElfLSectionNoteIterator &) = delete;
+
+public:
+  // FIXME: add move copy constructor and assignment operator.
+  ElfLSectionNoteIterator(const ElfLSectionNoteIterator &Other);
+  ~ElfLSectionNoteIterator();
+  ElfLSectionNoteIterator &operator++();
+  bool operator==(const ElfLSectionNoteIterator Other) const;
+  bool operator!=(const ElfLSectionNoteIterator Other) const;
+  ElfLNote operator*() const;
+};
+
+// Class representing ELF section.
+class ElfLSection {
+  const void *Impl = nullptr;
+
+  friend class ElfLSectionIterator;
+
+  // Only ElfLSectionIterator is allowed to create sections via its
+  // operator*().
+  explicit ElfLSection(const void *I);
+  ElfLSection &operator=(const ElfLSection &) = delete;
+
+public:
+  // FIXME: add move copy constructor and assignment operator.
+  ElfLSection(const ElfLSection &);
+  ~ElfLSection();
+
+  // Returns the section name, which is a null-terminated string.
+  const char *getName() const;
+  // Returns the section size.
+  uint64_t getSize() const;
+  // Returns a pointer to the beginning of the section.
+  const uint8_t *getContents() const;
+};
+
+// Iterator over sections.
+class ElfLSectionIterator
+    : std::iterator<std::input_iterator_tag, ElfLSection> {
+
+  void *Impl = nullptr;
+
+  friend class ElfL;
+
+  // Only ElfL is allowed to create iterators to itself.
+  ElfLSectionIterator(const void *I, bool IsEnd = false);
+  ElfLSectionIterator &operator=(const ElfLSectionIterator &) = delete;
+
+public:
+  // FIXME: add move copy constructor and assignment operator.
+  ElfLSectionIterator(const ElfLSectionIterator &Other);
+  ~ElfLSectionIterator();
+  ElfLSectionIterator &operator++();
+  bool operator==(const ElfLSectionIterator Other) const;
+  bool operator!=(const ElfLSectionIterator Other) const;
+  ElfLSection operator*() const;
+};
+
+// Wrapper around the given ELF image.
+class ElfL {
+  // Opaque pointer to the actual implementation.
+  void *Impl = nullptr;
+
+  // FIXME: implement if needed.
+  ElfL(const ElfL &) = delete;
+  ElfL &operator=(const ElfL &) = delete;
+
+public:
+  ElfL(char *Begin, size_t Size);
+  ~ElfL();
+  bool isValidElf() const;
+  const char *getErrmsg(int N) const;
+  uint16_t getEMachine() const;
+  uint16_t getEType() const;
+
+  static bool isDynType(uint16_t Ty);
+
+  ElfLSectionNoteIterator section_notes_begin() const;
+  ElfLSectionNoteIterator section_notes_end() const;
+  ElfLSegmentNoteIterator segment_notes_begin() const;
+  ElfLSegmentNoteIterator segment_notes_end() const;
+  ElfLSectionIterator sections_begin() const;
+  ElfLSectionIterator sections_end() const;
+};
+
+#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_ELF_COMMON_ELF_LIGHT_H
diff --git a/openmp/libomptarget/plugins/common/elf_common/elf_light.cpp b/openmp/libomptarget/plugins/common/elf_common/elf_light.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/common/elf_common/elf_light.cpp
@@ -0,0 +1,1682 @@
+//===-- elf_light.cpp - Basic ELF functionality -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "elf_light.h" +#include "Debug.h" +#include + +#ifndef TARGET_NAME +#define TARGET_NAME ELF light +#endif +#define DEBUG_PREFIX "TARGET " GETNAME(TARGET_NAME) + +#if MAY_USE_LIBELF + +// Implementation based on libelf. +#include +#include + +// Helper methods to align addresses. +template inline T alignDown(T value, size_t alignment) { + return (T)(value & ~(alignment - 1)); +} + +template inline T *alignDown(T *value, size_t alignment) { + return reinterpret_cast(alignDown((intptr_t)value, alignment)); +} + +template inline T alignUp(T value, size_t alignment) { + return alignDown((T)(value + alignment - 1), alignment); +} + +template inline T *alignUp(T *value, size_t alignment) { + return reinterpret_cast( + alignDown((intptr_t)(value + alignment - 1), alignment)); +} + +// FIXME: this is taken from openmp/libomptarget/plugins/amdgpu/impl/system.cpp, +// but it may be incorrect for 64-bit ELF. Elf64_Nhdr and Elf32_Nhdr +// have different representation. The alignment used for the name and +// the descriptor is still 4 bytes. At the same time, it seems to work +// for 64-bit ELFs produced by LLVM. +struct Elf_Note { + uint32_t n_namesz; // Length of note's name. + uint32_t n_descsz; // Length of note's value. + uint32_t n_type; // Type of note. + // then name + // then padding, optional + // then desc, at 4 byte alignment (not 8, despite being elf64) +}; + +static const uint32_t NoteAlignment = 4; + +// Implementation of the iterator for SHT_NOTE sections. +// The iterator allows processing all NOTEs in all SHT_NOTE sections +// in the ELF image provided during the iterator object construction. +class ElfLSectionNoteIteratorImpl { + // A pointer to Elf object created by elf_memory() for + // the ELF image we are going to iterate. + Elf *EF; + + // A pointer to the current SHT_NOTE section. + // In the initial state of the iterator object this will actually + // point to the very first section in the ELF image, but it will be + // adjusted right away either to point to the first SHT_NOTE section + // or set to nullptr (if there are no SHT_NOTE sections). + Elf_Scn *CurrentSection = nullptr; + + // A pointer to the current NOTE inside a SHT_NOTE section + // pointed by CurrentSection. If it is nullptr, then this means + // that the iterator object is an end() iterator. + Elf_Note *NPtr = nullptr; + + uint64_t getNotesBeginAddr(const GElf_Shdr &Shdr) const { + return reinterpret_cast(elf_rawfile(EF, nullptr)) + + Shdr.sh_offset; + } + + uint64_t getNotesEndAddr(const GElf_Shdr &Shdr) const { + return getNotesBeginAddr(Shdr) + Shdr.sh_size; + } + + uint64_t getNoteSize(const Elf_Note &Note) const { + return sizeof(Note) + alignUp(Note.n_namesz, NoteAlignment) + + alignUp(Note.n_descsz, NoteAlignment); + } + + // Given the current state of the iterator object, advances + // the iterator forward to point to the next NOTE in the next + // SHT_NOTE section. + // If there is no such a NOTE, then it sets the iterator + // object to the end() state. + // + // Note that this method does not change the iterator, if + // NPtr is pointing to a valid note within CurrentSection. + // The iterator advancement in this case is done via operator++. + void autoAdvance(bool IsFirst = false) { + // Cannot advance, if CurrentSection is NULL. 
+ if (!CurrentSection) + return; + + // NPtr points to a valid NOTE in CurrentSection, thus, + // no auto advancement. + if (NPtr) + return; + + GElf_Shdr Shdr; + gelf_getshdr(CurrentSection, &Shdr); + + // CurrentSection is a valid section, and NPtr is an end() iterator. + // + // If IsFirst is true, then we just in the initial state, and + // we need to set CurrentSection to the first SHT_NOTE section (if any), + // and, then, NPtr to the first note in this section. + // + // If IsFirst is false, then we've reached the end of the current + // SHT_NOTE section, and should find the next section with notes. + if (!IsFirst || gelf_getshdr(CurrentSection, &Shdr)->sh_type != SHT_NOTE) + CurrentSection = elf_nextscn(EF, CurrentSection); + + while (CurrentSection && + gelf_getshdr(CurrentSection, &Shdr)->sh_type != SHT_NOTE) + CurrentSection = elf_nextscn(EF, CurrentSection); + + if (!CurrentSection) { + // No more sections. + // Note that NPtr is already nullptr indicating the end() iterator. + return; + } + + gelf_getshdr(CurrentSection, &Shdr); + uint64_t NotesBegin = getNotesBeginAddr(Shdr); + uint64_t NotesEnd = getNotesEndAddr(Shdr); + if (NotesBegin >= NotesEnd) { + // Something went wrong. Assume that we've reached + // the end of all notes. + CurrentSection = nullptr; + NPtr = nullptr; + return; + } + + NPtr = reinterpret_cast(NotesBegin); + assert(NPtr && "Invalid SHT_NOTE section."); + } + + bool operator!=(const ElfLSectionNoteIteratorImpl Other) const { + return !(*this == Other); + } + +public: + ElfLSectionNoteIteratorImpl(Elf *RawElf, bool IsEnd = false) : EF(RawElf) { + assert(EF && "Trying to iterate invalid ELF."); + + if (IsEnd) { + // NPtr equal to nullptr means end() iterator. + return; + } + + // Set CurrentSection to the very first section, + // and let autoAdvance() find the first valid note (if any). + CurrentSection = elf_getscn(EF, 0); + autoAdvance(true); + } + + bool operator==(const ElfLSectionNoteIteratorImpl Other) const { + // They should be pointing to the same NOTE to be equal. + return NPtr == Other.NPtr; + } + + const Elf_Note *operator*() const { + assert(*this != ElfLSectionNoteIteratorImpl(EF, true) && + "Dereferencing the end iterator."); + return NPtr; + } + + // Advance to the next NOTE in the CurrentSection. + // If there is no next NOTE, then autoAdvance() to the next + // SHT_NOTE section and its first NOTE. + ElfLSectionNoteIteratorImpl &operator++() { + assert(*this != ElfLSectionNoteIteratorImpl(EF, true) && + "Incrementing the end iterator."); + + GElf_Shdr Shdr; + gelf_getshdr(CurrentSection, &Shdr); + uint64_t NotesBegin = getNotesBeginAddr(Shdr); + uint64_t NotesEnd = getNotesEndAddr(Shdr); + assert(reinterpret_cast(NPtr) >= NotesBegin && + reinterpret_cast(NPtr) < NotesEnd && + "Invalid pointer to a note computed somewhere else."); + (void)NotesBegin; + + uint64_t NoteSize = getNoteSize(*NPtr); + NPtr = + reinterpret_cast(reinterpret_cast(NPtr) + NoteSize); + if (reinterpret_cast(NPtr) >= NotesEnd || + reinterpret_cast(NPtr) + sizeof(*NPtr) >= NotesEnd) { + // We've reached the end of the current NOTE section. + NPtr = nullptr; + } + + // Auto advance to the next section, if needed. + autoAdvance(); + return *this; + } +}; + +// Implementation of the iterator for PT_NOTE segments. +// The iterator allows processing all NOTEs in all PT_NOTE segments +// in the ELF image provided during the iterator object construction. 
+class ElfLSegmentNoteIteratorImpl { + // A pointer to Elf object created by elf_memory() for + // the ELF image we are going to iterate. + Elf *EF; + + // A pointer to the current PT_NOTE segment. + // In the initial state of the iterator object this will actually + // point to the very first segment in the ELF image, but it will be + // adjusted right away either to point to the first PT_NOTE segment + // or set to nullptr (if there are no PT_NOTE segments). + size_t NumberOfPhdrs = (std::numeric_limits::max)(); + size_t CurrentSegment = (std::numeric_limits::max)(); + + // A pointer to the current NOTE inside a PT_NOTE segment + // pointed by CurrentSegment. If it is nullptr, then this means + // that the iterator object is an end() iterator. + Elf_Note *NPtr = nullptr; + + uint64_t getNotesBeginAddr(const GElf_Phdr &Phdr) const { + return reinterpret_cast(elf_rawfile(EF, nullptr)) + Phdr.p_offset; + } + + uint64_t getNotesEndAddr(const GElf_Phdr &Phdr) const { + return getNotesBeginAddr(Phdr) + Phdr.p_filesz; + } + + uint64_t getNoteSize(const Elf_Note &Note) const { + return sizeof(Note) + alignUp(Note.n_namesz, NoteAlignment) + + alignUp(Note.n_descsz, NoteAlignment); + } + + // Given the current state of the iterator object, advances + // the iterator forward to point to the next NOTE in the next + // PT_NOTE segment. + // If there is no such a NOTE, then it sets the iterator + // object to the end() state. + // + // Note that this method does not change the iterator, if + // NPtr is pointing to a valid note within CurrentSegment. + // The iterator advancement in this case is done via operator++. + void autoAdvance(bool IsFirst = false) { + // Cannot advance, if CurrentSegment is invalid. + if (CurrentSegment >= NumberOfPhdrs) + return; + + // NPtr points to a valid NOTE in CurrentSegment, thus, + // no auto advancement. + if (NPtr) + return; + + GElf_Phdr Phdr; + gelf_getphdr(EF, CurrentSegment, &Phdr); + + // CurrentSegment is a valid segment, and NPtr is an end() iterator. + // + // If IsFirst is true, then we just in the initial state, and + // we need to set CurrentSegment to the first PT_NOTE segment (if any), + // and, then, NPtr to the first note in this segment. + // + // If IsFirst is false, then we've reached the end of the current + // PT_NOTE segment, and should find the next segment with notes. + if (!IsFirst || Phdr.p_type != PT_NOTE) + ++CurrentSegment; + + while (CurrentSegment < NumberOfPhdrs) { + if (gelf_getphdr(EF, CurrentSegment, &Phdr) != &Phdr) + continue; + + if (Phdr.p_type == PT_NOTE) + break; + + ++CurrentSegment; + } + + if (CurrentSegment >= NumberOfPhdrs) { + // No more segments. + // Note that NPtr is already nullptr indicating the end() iterator. + return; + } + + if (gelf_getphdr(EF, CurrentSegment, &Phdr) != &Phdr) + assert(false && "Invalid program header selected above."); + + uint64_t NotesBegin = getNotesBeginAddr(Phdr); + uint64_t NotesEnd = getNotesEndAddr(Phdr); + if (NotesBegin >= NotesEnd) { + // Something went wrong. Assume that we've reached + // the end of all notes. + CurrentSegment = NumberOfPhdrs; + NPtr = nullptr; + return; + } + + NPtr = reinterpret_cast(NotesBegin); + assert(NPtr && "Invalid PT_NOTE segment."); + } + + bool operator!=(const ElfLSegmentNoteIteratorImpl Other) const { + return !(*this == Other); + } + +public: + ElfLSegmentNoteIteratorImpl(Elf *RawElf, bool IsEnd = false) : EF(RawElf) { + assert(EF && "Trying to iterate invalid ELF."); + + if (IsEnd) { + // NPtr equal to nullptr means end() iterator. 
+ return; + } + + // Set CurrentSegment to the very first segment, + // and let autoAdvance() find the first valid note (if any). + CurrentSegment = 0; + + // Set NumberOfPhdrs to 0, if we cannot query it. + if (elf_getphdrnum(EF, &NumberOfPhdrs) != 0) + NumberOfPhdrs = 0; + autoAdvance(true); + } + + bool operator==(const ElfLSegmentNoteIteratorImpl Other) const { + // They should be pointing to the same NOTE to be equal. + return NPtr == Other.NPtr; + } + + const Elf_Note *operator*() const { + assert(*this != ElfLSegmentNoteIteratorImpl(EF, true) && + "Dereferencing the end iterator."); + return NPtr; + } + + // Advance to the next NOTE in the CurrentSegment. + // If there is no next NOTE, then autoAdvance() to the next + // PT_NOTE segment and its first NOTE. + ElfLSegmentNoteIteratorImpl &operator++() { + assert(*this != ElfLSegmentNoteIteratorImpl(EF, true) && + "Incrementing the end iterator."); + + GElf_Phdr Phdr; + gelf_getphdr(EF, CurrentSegment, &Phdr); + uint64_t NotesBegin = getNotesBeginAddr(Phdr); + uint64_t NotesEnd = getNotesEndAddr(Phdr); + assert(reinterpret_cast(NPtr) >= NotesBegin && + reinterpret_cast(NPtr) < NotesEnd && + "Invalid pointer to a note computed somewhere else."); + (void)NotesBegin; + + uint64_t NoteSize = getNoteSize(*NPtr); + NPtr = + reinterpret_cast(reinterpret_cast(NPtr) + NoteSize); + if (reinterpret_cast(NPtr) >= NotesEnd || + reinterpret_cast(NPtr) + sizeof(*NPtr) >= NotesEnd) { + // We've reached the end of the current NOTE section. + NPtr = nullptr; + } + + // Auto advance to the next section, if needed. + autoAdvance(); + return *this; + } +}; + +class ElfLSectionImpl { + Elf *EF = nullptr; + Elf_Scn *Section = nullptr; + +public: + ElfLSectionImpl(Elf *EF, Elf_Scn *Section) : EF(EF), Section(Section) {} + + const char *getName() const { + size_t SHStrNdx; + if (elf_getshdrstrndx(EF, &SHStrNdx) != 0) + return ""; + + GElf_Shdr Shdr; + gelf_getshdr(Section, &Shdr); + char *Name = elf_strptr(EF, SHStrNdx, static_cast(Shdr.sh_name)); + return Name ? Name : ""; + } + + uint64_t getSize() const { + Elf_Data *Desc = elf_rawdata(Section, nullptr); + if (!Desc) + return 0; + + return Desc->d_size; + } + + const uint8_t *getContents() const { + Elf_Data *Desc = elf_rawdata(Section, nullptr); + if (!Desc) + return 0; + + return reinterpret_cast(Desc->d_buf); + } +}; + +class ElfLSectionIteratorImpl { + // A pointer to Elf object created by elf_memory() for + // the ELF image we are going to iterate. + Elf *EF = nullptr; + + // A pointer to the current section. + Elf_Scn *CurrentSection = nullptr; + + bool operator!=(const ElfLSectionIteratorImpl Other) const { + return !(*this == Other); + } + +public: + ElfLSectionIteratorImpl(Elf *RawElf, bool IsEnd = false) : EF(RawElf) { + assert(EF && "Trying to iterate invalid ELF."); + if (IsEnd) + return; + + CurrentSection = elf_getscn(EF, 0); + } + + bool operator==(const ElfLSectionIteratorImpl Other) const { + return CurrentSection == Other.CurrentSection; + } + + ElfLSectionImpl *operator*() const { + assert(*this != ElfLSectionIteratorImpl(EF, true) && + "Dereferencing the end iterator."); + return new ElfLSectionImpl(EF, CurrentSection); + } + + ElfLSectionIteratorImpl &operator++() { + assert(*this != ElfLSectionIteratorImpl(EF, true) && + "Dereferencing the end iterator."); + + CurrentSection = elf_nextscn(EF, CurrentSection); + return *this; + } +}; + +// Actual implementation of ElfL via libelf. 
+// It is constructed from an ELF image defined by its +// starting pointer in memory and a length in bytes. +class ElfLImpl { + // A pointer to Elf object created by elf_memory() for + // the ELF image. + Elf *EF = nullptr; + + // Class of the ELF image. + unsigned ElfClass = ELFCLASSNONE; + + // A pointer to the ELF image's header. + // Depending on the class it may be either 'Elf32_Ehdr *' + // or 'Elf64_Ehdr '. + const void *Header = nullptr; + + // Let the owning object access this. + friend class ElfL; + +public: + ElfLImpl(Elf *RawElf, unsigned ElfClass, const void *Header) + : EF(RawElf), ElfClass(ElfClass), Header(Header) {} + + // Allocates and constructs a new iterator for NOTEs in + // SHT_NOTE sections of the ELF image. + ElfLSectionNoteIteratorImpl * + createSectionNoteIteratorImpl(bool IsEnd) const { + return new ElfLSectionNoteIteratorImpl(EF, IsEnd); + } + + // Allocates and constructs a new iterator for NOTEs in + // PT_NOTE segments of the ELF image. + ElfLSegmentNoteIteratorImpl * + createSegmentNoteIteratorImpl(bool IsEnd) const { + return new ElfLSegmentNoteIteratorImpl(EF, IsEnd); + } + + ElfLSectionIteratorImpl * + createSectionIteratorImpl(bool IsEnd) const { + return new ElfLSectionIteratorImpl(EF, IsEnd); + } +}; + +ElfL::ElfL(char *Begin, size_t Size) { + Elf *ElfHandle = elf_memory(Begin, Size); + if (!ElfHandle) { + elf_end(ElfHandle); + return; + } + + const Elf32_Ehdr *Header32 = elf32_getehdr(ElfHandle); + const Elf64_Ehdr *Header64 = elf64_getehdr(ElfHandle); + + if (!Header32 == !Header64) { + // Ambiguous ELF header or unrecognized ELF image. + elf_end(ElfHandle); + return; + } + + const void *Header = nullptr; + unsigned ElfClass = ELFCLASSNONE; + + if (Header32) { + ElfClass = ELFCLASS32; + Header = reinterpret_cast(Header32); + } else { + ElfClass = ELFCLASS64; + Header = reinterpret_cast(Header64); + } + + Impl = reinterpret_cast(new ElfLImpl(ElfHandle, ElfClass, Header)); +} + +ElfL::~ElfL() { + if (Impl) { + ElfLImpl *EImpl = reinterpret_cast(Impl); + elf_end(EImpl->EF); + delete EImpl; + } +} + +bool ElfL::isValidElf() const { + ElfLImpl *EImpl = reinterpret_cast(Impl); + return Impl && EImpl->Header && EImpl->ElfClass != ELFCLASSNONE; +} + +const char *ElfL::getErrmsg(int N) const { return elf_errmsg(-1); } + +uint16_t ElfL::getEMachine() const { + assert(isValidElf() && "Invalid ELF."); + ElfLImpl *EImpl = reinterpret_cast(Impl); + if (EImpl->ElfClass == ELFCLASS32) + return reinterpret_cast(EImpl->Header)->e_machine; + else if (EImpl->ElfClass == ELFCLASS64) + return reinterpret_cast(EImpl->Header)->e_machine; + else + assert(false && "Unsupported ELF class."); + + return EM_NONE; +} + +uint16_t ElfL::getEType() const { + assert(isValidElf() && "Invalid ELF."); + ElfLImpl *EImpl = reinterpret_cast(Impl); + if (EImpl->ElfClass == ELFCLASS32) + return reinterpret_cast(EImpl->Header)->e_type; + else if (EImpl->ElfClass == ELFCLASS64) + return reinterpret_cast(EImpl->Header)->e_type; + else + assert(false && "Unsupported ELF class."); + + return ET_NONE; +} + +bool ElfL::isDynType(uint16_t Ty) { return Ty == ET_DYN; } + +ElfLSectionNoteIterator ElfL::section_notes_begin() const { + return ElfLSectionNoteIterator(reinterpret_cast(Impl)); +} + +ElfLSectionNoteIterator ElfL::section_notes_end() const { + return ElfLSectionNoteIterator(reinterpret_cast(Impl), + true); +} + +ElfLSectionNoteIterator::ElfLSectionNoteIterator(const void *I, bool IsEnd) { + const ElfLImpl *EImpl = reinterpret_cast(I); + Impl = EImpl->createSectionNoteIteratorImpl(IsEnd); +} 
+ +ElfLSectionNoteIterator::ElfLSectionNoteIterator( + const ElfLSectionNoteIterator &Other) { + ElfLSectionNoteIteratorImpl *IImpl = + reinterpret_cast(Other.Impl); + Impl = new ElfLSectionNoteIteratorImpl(*IImpl); +} + +ElfLSectionNoteIterator::~ElfLSectionNoteIterator() { + assert(Impl && "Invalid ElfLSectionNoteIterator object."); + ElfLSectionNoteIteratorImpl *IImpl = + reinterpret_cast(Impl); + delete IImpl; +} + +bool ElfLSectionNoteIterator::operator==( + const ElfLSectionNoteIterator Other) const { + const ElfLSectionNoteIteratorImpl *Lhs = + reinterpret_cast(Impl); + const ElfLSectionNoteIteratorImpl *Rhs = + reinterpret_cast(Other.Impl); + return (*Lhs == *Rhs); +} + +bool ElfLSectionNoteIterator::operator!=( + const ElfLSectionNoteIterator Other) const { + return !(*this == Other); +} + +ElfLSectionNoteIterator &ElfLSectionNoteIterator::operator++() { + ElfLSectionNoteIteratorImpl *IImpl = + reinterpret_cast(Impl); + ++(*IImpl); + return *this; +} + +ElfLNote ElfLSectionNoteIterator::operator*() const { + ElfLSectionNoteIteratorImpl *IImpl = + reinterpret_cast(Impl); + return ElfLNote(**IImpl); +} + +ElfLSegmentNoteIterator ElfL::segment_notes_begin() const { + return ElfLSegmentNoteIterator(reinterpret_cast(Impl)); +} + +ElfLSegmentNoteIterator ElfL::segment_notes_end() const { + return ElfLSegmentNoteIterator(reinterpret_cast(Impl), + true); +} + +ElfLSegmentNoteIterator::ElfLSegmentNoteIterator(const void *I, bool IsEnd) { + const ElfLImpl *EImpl = reinterpret_cast(I); + Impl = EImpl->createSegmentNoteIteratorImpl(IsEnd); +} + +ElfLSegmentNoteIterator::ElfLSegmentNoteIterator( + const ElfLSegmentNoteIterator &Other) { + ElfLSegmentNoteIteratorImpl *IImpl = + reinterpret_cast(Other.Impl); + Impl = new ElfLSegmentNoteIteratorImpl(*IImpl); +} + +ElfLSegmentNoteIterator::~ElfLSegmentNoteIterator() { + assert(Impl && "Invalid ElfLSegmentNoteIterator object."); + ElfLSegmentNoteIteratorImpl *IImpl = + reinterpret_cast(Impl); + delete IImpl; +} + +bool ElfLSegmentNoteIterator::operator==( + const ElfLSegmentNoteIterator Other) const { + const ElfLSegmentNoteIteratorImpl *Lhs = + reinterpret_cast(Impl); + const ElfLSegmentNoteIteratorImpl *Rhs = + reinterpret_cast(Other.Impl); + return (*Lhs == *Rhs); +} + +bool ElfLSegmentNoteIterator::operator!=( + const ElfLSegmentNoteIterator Other) const { + return !(*this == Other); +} + +ElfLSegmentNoteIterator &ElfLSegmentNoteIterator::operator++() { + ElfLSegmentNoteIteratorImpl *IImpl = + reinterpret_cast(Impl); + ++(*IImpl); + return *this; +} + +ElfLNote ElfLSegmentNoteIterator::operator*() const { + ElfLSegmentNoteIteratorImpl *IImpl = + reinterpret_cast(Impl); + return ElfLNote(**IImpl); +} + +ElfLNote::ElfLNote(const void *I) { + // ElfLNote::Impl is a pointer to Elf_Note in this implementation. + // A pointer to Elf_Note is returned by + // ElfLSectionNoteIteratorImpl::operator*(). + Impl = I; +} + +ElfLNote::ElfLNote(const ElfLNote &Other) { Impl = Other.Impl; } + +ElfLNote::~ElfLNote() {} + +uint64_t ElfLNote::getNameSize() const { + const Elf_Note *Note = reinterpret_cast(Impl); + if (Note->n_namesz == 0) + return 0; + // libelf returns name size that accounts for the null terminator. + // ELF light interface returns the size ignoring it. 
+ return Note->n_namesz - 1; +} + +const char *ElfLNote::getName() const { + const Elf_Note *Note = reinterpret_cast(Impl); + return reinterpret_cast(Note) + sizeof(*Note); +} + +uint64_t ElfLNote::getDescSize() const { + const Elf_Note *Note = reinterpret_cast(Impl); + return Note->n_descsz; +} + +const uint8_t *ElfLNote::getDesc() const { + const Elf_Note *Note = reinterpret_cast(Impl); + return reinterpret_cast(Note) + sizeof(*Note) + + alignUp(getNameSize(), NoteAlignment); +} + +uint64_t ElfLNote::getType() const { + const Elf_Note *Note = reinterpret_cast(Impl); + return Note->n_type; +} + +ElfLSection::ElfLSection(const void *I) { + Impl = I; +} + +ElfLSection::ElfLSection(const ElfLSection &Other) { + const ElfLSectionImpl *SImpl = + reinterpret_cast(Other.Impl); + Impl = new ElfLSectionImpl(*SImpl); +} + +ElfLSection::~ElfLSection() { + const ElfLSectionImpl *SImpl = + reinterpret_cast(Impl); + delete SImpl; +} + +const char *ElfLSection::getName() const { + const ElfLSectionImpl *SImpl = + reinterpret_cast(Impl); + return SImpl->getName(); +} + +uint64_t ElfLSection::getSize() const { + const ElfLSectionImpl *SImpl = + reinterpret_cast(Impl); + return SImpl->getSize(); +} + +const uint8_t *ElfLSection::getContents() const { + const ElfLSectionImpl *SImpl = + reinterpret_cast(Impl); + return SImpl->getContents(); +} + +ElfLSectionIterator ElfL::sections_begin() const { + return ElfLSectionIterator(Impl); +} + +ElfLSectionIterator ElfL::sections_end() const { + return ElfLSectionIterator(Impl, true); +} + +ElfLSectionIterator::ElfLSectionIterator(const void *I, bool IsEnd) { + const ElfLImpl *EImpl = reinterpret_cast(I); + Impl = EImpl->createSectionIteratorImpl(IsEnd); +} + +ElfLSectionIterator::ElfLSectionIterator( + const ElfLSectionIterator &Other) { + ElfLSectionIteratorImpl *IImpl = + reinterpret_cast(Other.Impl); + Impl = new ElfLSectionIteratorImpl(*IImpl); +} + +ElfLSectionIterator::~ElfLSectionIterator() { + assert(Impl && "Invalid ElfLSectionIterator object."); + ElfLSectionIteratorImpl *IImpl = + reinterpret_cast(Impl); + delete IImpl; +} + +bool ElfLSectionIterator::operator==( + const ElfLSectionIterator Other) const { + const ElfLSectionIteratorImpl *Lhs = + reinterpret_cast(Impl); + const ElfLSectionIteratorImpl *Rhs = + reinterpret_cast(Other.Impl); + return (*Lhs == *Rhs); +} + +bool ElfLSectionIterator::operator!=( + const ElfLSectionIterator Other) const { + return !(*this == Other); +} + +ElfLSectionIterator &ElfLSectionIterator::operator++() { + ElfLSectionIteratorImpl *IImpl = + reinterpret_cast(Impl); + ++(*IImpl); + return *this; +} + +ElfLSection ElfLSectionIterator::operator*() const { + ElfLSectionIteratorImpl *IImpl = + reinterpret_cast(Impl); + return ElfLSection(**IImpl); +} +#else // !MAY_USE_LIBELF + +// Implementation based on LLVM ELF binary format. 
+#include "llvm/Object/Binary.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/ELFTypes.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/MemoryBuffer.h" + +using namespace llvm; +using namespace llvm::ELF; +using namespace llvm::object; + +class ElfLNoteImplBase { +public: + virtual ~ElfLNoteImplBase() = default; + virtual ElfLNoteImplBase *clone() const = 0; + virtual size_t getNameSize() const = 0; + virtual const char *getName() const = 0; + virtual size_t getDescSize() const = 0; + virtual const uint8_t *getDesc() const = 0; + virtual uint32_t getType() const = 0; +}; + +template class ElfLNoteImpl : public ElfLNoteImplBase { + using Elf_Note = typename ELFT::Note; + const Elf_Note Note; + +public: + ElfLNoteImpl(const Elf_Note Note) : Note(Note) {} + ElfLNoteImpl(const ElfLNoteImpl &) = default; + ElfLNoteImplBase *clone() const override { return new ElfLNoteImpl(*this); } + ~ElfLNoteImpl() = default; + size_t getNameSize() const override { return Note.getName().size(); } + const char *getName() const override { return Note.getName().data(); } + size_t getDescSize() const override { return Note.getDesc().size(); } + const uint8_t *getDesc() const override { return Note.getDesc().data(); } + uint32_t getType() const override { return Note.getType(); } +}; + +class ElfLNoteIteratorImplBase { +protected: + const endianness TargetEndianness; + const bool Is64Bits; + const bool IsSectionIterator; + + ElfLNoteIteratorImplBase(endianness TargetEndianness, bool Is64Bits, + bool IsSectionIterator) + : TargetEndianness(TargetEndianness), Is64Bits(Is64Bits), + IsSectionIterator(IsSectionIterator) {} + +public: + ElfLNoteIteratorImplBase(const ElfLNoteIteratorImplBase &) = default; + virtual ~ElfLNoteIteratorImplBase() = default; + virtual ElfLNoteIteratorImplBase *clone() const = 0; + virtual ElfLNoteIteratorImplBase &operator++() = 0; + virtual bool operator==(const ElfLNoteIteratorImplBase &) const = 0; + virtual ElfLNoteImplBase *operator*() const = 0; + + endianness getEndianness() const { return TargetEndianness; } + + bool is64Bits() const { return Is64Bits; } + + bool isSectionIterator() const { return IsSectionIterator; } +}; + +template +class ElfLNoteIteratorImpl : public ElfLNoteIteratorImplBase { +protected: + using NoteIterator = typename ELFT::NoteIterator; + + const ELFFile &EF; + NoteIterator NotesIt; + Error &Err; + + explicit ElfLNoteIteratorImpl(const ELFFile &EF, Error &Err, + bool IsSectionIterator) + : ElfLNoteIteratorImplBase(ELFT::TargetEndianness, ELFT::Is64Bits, + IsSectionIterator), + EF(EF), NotesIt(EF.notes_end()), Err(Err) {} + +public: + ElfLNoteIteratorImpl(const ElfLNoteIteratorImpl &) = default; + virtual ~ElfLNoteIteratorImpl() = default; + + static bool classof(const ElfLNoteIteratorImplBase *B) { + return (B->getEndianness() == ELFT::TargetEndianness && + B->is64Bits() == ELFT::Is64Bits); + } +}; + +template +class ElfLSectionNoteIteratorImpl : public ElfLNoteIteratorImpl { + using Elf_Shdr = typename ELFT::Shdr; + using Elf_Shdr_Range = typename ELFT::ShdrRange; + using NoteIterator = typename ElfLNoteIteratorImpl::NoteIterator; + using SectionsIteratorTy = typename Elf_Shdr_Range::iterator; + + SectionsIteratorTy SectionsIt; + + const ELFFile &getEF() const { return this->EF; } + const NoteIterator &getNotesIt() const { return this->NotesIt; } + Error &getErr() const { return this->Err; } + NoteIterator &getNotesIt() { return this->NotesIt; } + SectionsIteratorTy section_begin() const { + Expected Sections = 
getEF().sections(); + if (!Sections) + return SectionsIteratorTy(); + + return Sections->begin(); + } + + SectionsIteratorTy section_end() const { + Expected Sections = getEF().sections(); + if (!Sections) + return SectionsIteratorTy(); + + return Sections->end(); + } + + bool isEqual(const ElfLSectionNoteIteratorImpl &Lhs, + const ElfLSectionNoteIteratorImpl &Rhs) const { + // Check for end() iterators, first. + if (Lhs.SectionsIt == section_end() && Rhs.SectionsIt == section_end()) + return true; + + if (Lhs.SectionsIt != Rhs.SectionsIt) + return false; + + return Lhs.getNotesIt() == Rhs.getNotesIt(); + } + + void autoAdvance(bool IsFirst = false) { + if (SectionsIt == section_end()) + return; + + if (getNotesIt() != getEF().notes_end()) + return; + + // SectionsIt is not an end iterator, and NotesIt is an end() + // iterator. + // + // If IsFirst is true, then we just in the initial state, and + // we need to set SectionsIt to the first SHT_NOTE section (if any), + // and, then, NotesIt to the first note in this section. + // + // If IsFirst is false, then we've reached the end of the current + // SHT_NOTE section, and should find the next section with notes. + if (!IsFirst || SectionsIt->sh_type != ELF::SHT_NOTE) + ++SectionsIt; + + while (SectionsIt != section_end() && + SectionsIt->sh_type != ELF::SHT_NOTE) { + ++SectionsIt; + } + + if (SectionsIt == section_end()) { + // No more sections. + return; + } + + const Elf_Shdr &Section = *SectionsIt; + getNotesIt() = getEF().notes_begin(Section, getErr()); + + // Auto advance the iterator, if the NOTE section + // does not contain any notes (e.g. some error happened + // during the note parsing). + autoAdvance(); + } + + bool operator!=(const ElfLSectionNoteIteratorImpl &Other) const { + return !(*this == Other); + } + +public: + ElfLSectionNoteIteratorImpl(const ELFFile &EF, Error &Err, + bool IsEnd = false) + : ElfLNoteIteratorImpl(EF, Err, true) { + if (IsEnd) { + SectionsIt = section_end(); + // It is an end() iterator, if SectionsIt is an end() iterator. + return; + } + + SectionsIt = section_begin(); + autoAdvance(true); + } + + ElfLSectionNoteIteratorImpl(const ElfLSectionNoteIteratorImpl &Copy) = + default; + + ElfLNoteIteratorImplBase *clone() const override { + return new ElfLSectionNoteIteratorImpl(*this); + } + + bool operator==(const ElfLNoteIteratorImplBase &Other) const override { + if (const ElfLSectionNoteIteratorImpl *OPtr = + dyn_cast(&Other)) { + return isEqual(*this, *OPtr); + } + return false; + } + + ElfLSectionNoteIteratorImpl &operator++() override { + assert(*this != ElfLSectionNoteIteratorImpl(getEF(), getErr(), true) && + "Incrementing the end iterator."); + // Move the notes iterator within the current section. 
+ ++getNotesIt(); + autoAdvance(); + + return *this; + } + + ElfLNoteImplBase *operator*() const override { + assert(*this != ElfLSectionNoteIteratorImpl(getEF(), getErr(), true) && + "Dereferencing the end iterator."); + return new ElfLNoteImpl(*getNotesIt()); + } + + static bool classof(const ElfLNoteIteratorImplBase *B) { + return (ElfLNoteIteratorImpl::classof(B) && + B->isSectionIterator() == true); + } +}; + +template +class ElfLSegmentNoteIteratorImpl : public ElfLNoteIteratorImpl { + using Elf_Phdr = typename ELFT::Phdr; + using Elf_Phdr_Range = typename ELFT::PhdrRange; + using NoteIterator = typename ElfLNoteIteratorImpl::NoteIterator; + using SegmentIteratorTy = typename Elf_Phdr_Range::iterator; + + SegmentIteratorTy SegmentsIt; + + const ELFFile &getEF() const { return this->EF; } + const NoteIterator &getNotesIt() const { return this->NotesIt; } + Error &getErr() const { return this->Err; } + NoteIterator &getNotesIt() { return this->NotesIt; } + SegmentIteratorTy segment_begin() const { + Expected Segments = getEF().program_headers(); + if (!Segments) + return SegmentIteratorTy(); + + return Segments->begin(); + } + + SegmentIteratorTy segment_end() const { + Expected Segments = getEF().program_headers(); + if (!Segments) + return SegmentIteratorTy(); + + return Segments->end(); + } + + bool isEqual(const ElfLSegmentNoteIteratorImpl &Lhs, + const ElfLSegmentNoteIteratorImpl &Rhs) const { + // Check for end() iterators, first. + if (Lhs.SegmentsIt == segment_end() && Rhs.SegmentsIt == segment_end()) + return true; + + if (Lhs.SegmentsIt != Rhs.SegmentsIt) + return false; + + return Lhs.getNotesIt() == Rhs.getNotesIt(); + } + + void autoAdvance(bool IsFirst = false) { + if (SegmentsIt == segment_end()) + return; + + if (getNotesIt() != getEF().notes_end()) + return; + + // SegmentsIt is not an end iterator, and NotesIt is an end() + // iterator. + // + // If IsFirst is true, then we just in the initial state, and + // we need to set SegmentsIt to the first PT_NOTE segment (if any), + // and, then, NotesIt to the first note in this segment. + // + // If IsFirst is false, then we've reached the end of the current + // PT_NOTE segment, and should find the next segment with notes. + if (!IsFirst || SegmentsIt->p_type != ELF::SHT_NOTE) + ++SegmentsIt; + + while (SegmentsIt != segment_end() && SegmentsIt->p_type != ELF::PT_NOTE) { + ++SegmentsIt; + } + + if (SegmentsIt == segment_end()) { + // No more segments. + return; + } + + const Elf_Phdr &Segment = *SegmentsIt; + getNotesIt() = getEF().notes_begin(Segment, getErr()); + + // Auto advance the iterator, if the NOTE segment + // does not contain any notes (e.g. some error happened + // during the note parsing). + autoAdvance(); + } + + bool operator!=(const ElfLSegmentNoteIteratorImpl &Other) const { + return !(*this == Other); + } + +public: + ElfLSegmentNoteIteratorImpl(const ELFFile &EF, Error &Err, + bool IsEnd = false) + : ElfLNoteIteratorImpl(EF, Err, false) { + if (IsEnd) { + SegmentsIt = segment_end(); + // It is an end() iterator, if SegmentsIt is an end() iterator. 
+ return; + } + + SegmentsIt = segment_begin(); + autoAdvance(true); + } + + ElfLSegmentNoteIteratorImpl(const ElfLSegmentNoteIteratorImpl &Copy) = + default; + + ElfLNoteIteratorImplBase *clone() const override { + return new ElfLSegmentNoteIteratorImpl(*this); + } + + bool operator==(const ElfLNoteIteratorImplBase &Other) const override { + if (const ElfLSegmentNoteIteratorImpl *OPtr = + dyn_cast(&Other)) { + return isEqual(*this, *OPtr); + } + return false; + } + + ElfLSegmentNoteIteratorImpl &operator++() override { + assert(*this != ElfLSegmentNoteIteratorImpl(getEF(), getErr(), true) && + "Incrementing the end iterator."); + // Move the notes iterator within the current segment. + ++getNotesIt(); + autoAdvance(); + + return *this; + } + + ElfLNoteImplBase *operator*() const override { + assert(*this != ElfLSegmentNoteIteratorImpl(getEF(), getErr(), true) && + "Dereferencing the end iterator."); + return new ElfLNoteImpl(*getNotesIt()); + } + + static bool classof(const ElfLNoteIteratorImplBase *B) { + return (ElfLNoteIteratorImpl::classof(B) && + B->isSectionIterator() == false); + } +}; + +class ElfLSectionImplBase { +public: + virtual ~ElfLSectionImplBase() = default; + virtual ElfLSectionImplBase *clone() const = 0; + virtual const char *getName() const = 0; + virtual uint64_t getSize() const = 0; + virtual const uint8_t *getContents() const = 0; +}; + +template class ElfLSectionImpl : public ElfLSectionImplBase { + using Elf_Shdr = typename ELFT::Shdr; + + const ELFFile &EF; + const Elf_Shdr &Section; + +public: + ElfLSectionImpl(const ELFFile &EF, const Elf_Shdr &Section) + : EF(EF), Section(Section) {} + ElfLSectionImpl(const ElfLSectionImpl &) = default; + ElfLSectionImpl *clone() const override { return new ElfLSectionImpl(*this); } + ~ElfLSectionImpl() = default; + + const char *getName() const override { + Expected NameOrErr = EF.getSectionName(Section); + if (!NameOrErr) { + consumeError(NameOrErr.takeError()); + return ""; + } + return NameOrErr->data(); + } + + uint64_t getSize() const override { + Expected> ContentsOrErr = EF.getSectionContents(Section); + if (!ContentsOrErr) { + consumeError(ContentsOrErr.takeError()); + return 0; + } + return ContentsOrErr->size(); + } + + const uint8_t *getContents() const override { + Expected> ContentsOrErr = EF.getSectionContents(Section); + if (!ContentsOrErr) { + consumeError(ContentsOrErr.takeError()); + return 0; + } + return ContentsOrErr->data(); + } +}; + +class ElfLSectionIteratorImplBase { +protected: + const endianness TargetEndianness; + const bool Is64Bits; + + ElfLSectionIteratorImplBase(endianness TargetEndianness, bool Is64Bits) + : TargetEndianness(TargetEndianness), Is64Bits(Is64Bits) {} + +public: + ElfLSectionIteratorImplBase(const ElfLSectionIteratorImplBase &) = default; + virtual ~ElfLSectionIteratorImplBase() = default; + virtual ElfLSectionIteratorImplBase *clone() const = 0; + virtual ElfLSectionIteratorImplBase &operator++() = 0; + virtual bool operator==(const ElfLSectionIteratorImplBase &) const = 0; + virtual ElfLSectionImplBase *operator*() const = 0; + + endianness getEndianness() const { return TargetEndianness; } + + bool is64Bits() const { return Is64Bits; } +}; + +template +class ElfLSectionIteratorImpl : public ElfLSectionIteratorImplBase { + using Elf_Shdr = typename ELFT::Shdr; + using Elf_Shdr_Range = typename ELFT::ShdrRange; + using SectionsIteratorTy = typename Elf_Shdr_Range::iterator; + + const ELFFile &EF; + SectionsIteratorTy SectionsIt; + + const ELFFile &getEF() const { return EF; } 
+ + SectionsIteratorTy section_begin() const { + Expected Sections = getEF().sections(); + if (!Sections) + return SectionsIteratorTy(); + + return Sections->begin(); + } + + SectionsIteratorTy section_end() const { + Expected Sections = getEF().sections(); + if (!Sections) + return SectionsIteratorTy(); + + return Sections->end(); + } + + bool isEqual(const ElfLSectionIteratorImpl &Lhs, + const ElfLSectionIteratorImpl &Rhs) const { + return Lhs.SectionsIt == Rhs.SectionsIt; + } + + bool operator!=(const ElfLSectionIteratorImpl Other) const { + return !(*this == Other); + } + +public: + ElfLSectionIteratorImpl(const ELFFile &EF, bool IsEnd = false) + : ElfLSectionIteratorImplBase(ELFT::TargetEndianness, ELFT::Is64Bits), + EF(EF) { + if (IsEnd) { + SectionsIt = section_end(); + return; + } + + SectionsIt = section_begin(); + } + + ElfLSectionIteratorImpl *clone() const override { + return new ElfLSectionIteratorImpl(*this); + } + + bool operator==(const ElfLSectionIteratorImplBase &Other) const override { + if (const ElfLSectionIteratorImpl *OPtr = + dyn_cast(&Other)) { + return isEqual(*this, *OPtr); + } + return false; + } + + ElfLSectionImplBase *operator*() const override { + assert(*this != ElfLSectionIteratorImpl(EF, true) && + "Dereferencing the end iterator."); + return new ElfLSectionImpl(EF, *SectionsIt); + } + + ElfLSectionIteratorImpl &operator++() override { + assert(*this != ElfLSectionIteratorImpl(EF, true) && + "Dereferencing the end iterator."); + + ++SectionsIt; + return *this; + } + + static bool classof(const ElfLSectionIteratorImplBase *B) { + return (B->getEndianness() == ELFT::TargetEndianness && + B->is64Bits() == ELFT::Is64Bits); + } +}; + +class ElfLImplBase { +public: + ElfLImplBase() = default; + ElfLImplBase(const ElfLImplBase &) = delete; + ElfLImplBase &operator=(const ElfLImplBase &) = delete; + virtual ~ElfLImplBase() = default; + virtual uint16_t getEMachine() const = 0; + virtual uint16_t getEType() const = 0; + + virtual ElfLNoteIteratorImplBase * + createSectionNoteIteratorImpl(bool IsEnd) const = 0; + virtual ElfLNoteIteratorImplBase * + createSegmentNoteIteratorImpl(bool IsEnd) const = 0; + virtual ElfLSectionIteratorImplBase * + createSectionIteratorImpl(bool IsEnd) const = 0; +}; + +template class ElfLImpl : public ElfLImplBase { + std::unique_ptr> File; + Error *Err = nullptr; + + friend class ElfL; + +public: + ElfLImpl(std::unique_ptr F) { + ObjectFile *FPtr = F.release(); + if (auto *Obj = dyn_cast>(FPtr)) + File = std::unique_ptr>(Obj); + else + assert(false && "Not an ELF object file, or ELF class is wrong."); + + Err = new Error(std::move(Error::success())); + } + ElfLImpl(const ElfLImpl &) = delete; + ElfLImpl &operator=(const ElfLImpl &) = delete; + virtual ~ElfLImpl() { + if (!Err) + return; + + if (*Err) { + auto ErrorString = toString(std::move(*Err)); + DP("Destroying ELF object parsed with errors: %s\n", ErrorString.c_str()); + } else { + delete Err; + } + Err = nullptr; + } + uint16_t getEMachine() const override { + return cast(File.get())->getEMachine(); + } + uint16_t getEType() const override { + return cast(File.get())->getEType(); + } + + ElfLNoteIteratorImplBase * + createSectionNoteIteratorImpl(bool IsEnd) const override { + return new ElfLSectionNoteIteratorImpl(File->getELFFile(), *Err, + IsEnd); + } + + ElfLNoteIteratorImplBase * + createSegmentNoteIteratorImpl(bool IsEnd) const override { + return new ElfLSegmentNoteIteratorImpl(File->getELFFile(), *Err, + IsEnd); + } + + ElfLSectionIteratorImplBase * + 
createSectionIteratorImpl(bool IsEnd) const override { + return new ElfLSectionIteratorImpl(File->getELFFile(), IsEnd); + } +}; + +ElfL::ElfL(char *Begin, size_t Size) { + StringRef StrBuf(Begin, Size); + std::unique_ptr MemBuf = + MemoryBuffer::getMemBuffer(StrBuf, "", false); + Expected> BinOrErr = + ObjectFile::createELFObjectFile(MemBuf->getMemBufferRef(), + /*InitContent=*/false); + if (!BinOrErr) { + consumeError(BinOrErr.takeError()); + return; + } + + if (isa(BinOrErr->get())) { + Impl = + reinterpret_cast(new ElfLImpl(std::move(*BinOrErr))); + } else if (isa(BinOrErr->get())) + Impl = + reinterpret_cast(new ElfLImpl(std::move(*BinOrErr))); + else if (isa(BinOrErr->get())) + Impl = + reinterpret_cast(new ElfLImpl(std::move(*BinOrErr))); + else if (isa(BinOrErr->get())) + Impl = + reinterpret_cast(new ElfLImpl(std::move(*BinOrErr))); +} + +ElfL::~ElfL() { + ElfLImplBase *EImpl = reinterpret_cast(Impl); + delete EImpl; +} + +bool ElfL::isValidElf() const { return Impl; } + +const char *ElfL::getErrmsg(int N) const { + // TODO: return text representation for the latest Error. + return "LLVM ELF error"; +} + +uint16_t ElfL::getEMachine() const { + assert(isValidElf() && "Invalid ELF."); + ElfLImplBase *EImpl = reinterpret_cast(Impl); + return EImpl->getEMachine(); +} + +uint16_t ElfL::getEType() const { + assert(isValidElf() && "Invalid ELF."); + ElfLImplBase *EImpl = reinterpret_cast(Impl); + return EImpl->getEType(); +} + +bool ElfL::isDynType(uint16_t Ty) { return Ty == ET_DYN; } + +ElfLSectionNoteIterator::ElfLSectionNoteIterator(const void *I, bool IsEnd) { + const ElfLImplBase *EImpl = reinterpret_cast(I); + // Create new ElfLSectionNoteIteratorImpl object. + Impl = EImpl->createSectionNoteIteratorImpl(IsEnd); +} + +ElfLSectionNoteIterator::~ElfLSectionNoteIterator() { + const ElfLNoteIteratorImplBase *IImpl = + reinterpret_cast(Impl); + delete IImpl; +} + +ElfLSectionNoteIterator::ElfLSectionNoteIterator( + const ElfLSectionNoteIterator &Other) { + const ElfLNoteIteratorImplBase *IImpl = + reinterpret_cast(Other.Impl); + Impl = IImpl->clone(); +} + +bool ElfLSectionNoteIterator::operator==( + const ElfLSectionNoteIterator Other) const { + const ElfLNoteIteratorImplBase *Lhs = + reinterpret_cast(Impl); + const ElfLNoteIteratorImplBase *Rhs = + reinterpret_cast(Other.Impl); + return (*Lhs == *Rhs); +} + +bool ElfLSectionNoteIterator::operator!=( + const ElfLSectionNoteIterator Other) const { + return !(*this == Other); +} + +ElfLSectionNoteIterator &ElfLSectionNoteIterator::operator++() { + ElfLNoteIteratorImplBase *EImpl = + reinterpret_cast(Impl); + ++(*EImpl); + return *this; +} + +ElfLNote ElfLSectionNoteIterator::operator*() const { return ElfLNote(Impl); } + +ElfLSectionNoteIterator ElfL::section_notes_begin() const { + assert(isValidElf() && "Invalid ELF."); + return ElfLSectionNoteIterator(reinterpret_cast(Impl)); +} + +ElfLSectionNoteIterator ElfL::section_notes_end() const { + assert(isValidElf() && "Invalid ELF."); + return ElfLSectionNoteIterator(reinterpret_cast(Impl), + true); +} + +ElfLSegmentNoteIterator::ElfLSegmentNoteIterator(const void *I, bool IsEnd) { + const ElfLImplBase *EImpl = reinterpret_cast(I); + // Create new ElfLSegmentNoteIteratorImpl object. 
+ Impl = EImpl->createSegmentNoteIteratorImpl(IsEnd); +} + +ElfLSegmentNoteIterator::~ElfLSegmentNoteIterator() { + const ElfLNoteIteratorImplBase *IImpl = + reinterpret_cast(Impl); + delete IImpl; +} + +ElfLSegmentNoteIterator::ElfLSegmentNoteIterator( + const ElfLSegmentNoteIterator &Other) { + const ElfLNoteIteratorImplBase *IImpl = + reinterpret_cast(Other.Impl); + Impl = IImpl->clone(); +} + +bool ElfLSegmentNoteIterator::operator==( + const ElfLSegmentNoteIterator Other) const { + const ElfLNoteIteratorImplBase *Lhs = + reinterpret_cast(Impl); + const ElfLNoteIteratorImplBase *Rhs = + reinterpret_cast(Other.Impl); + return (*Lhs == *Rhs); +} + +bool ElfLSegmentNoteIterator::operator!=( + const ElfLSegmentNoteIterator Other) const { + return !(*this == Other); +} + +ElfLSegmentNoteIterator &ElfLSegmentNoteIterator::operator++() { + ElfLNoteIteratorImplBase *EImpl = + reinterpret_cast(Impl); + ++(*EImpl); + return *this; +} + +ElfLNote ElfLSegmentNoteIterator::operator*() const { return ElfLNote(Impl); } + +ElfLSegmentNoteIterator ElfL::segment_notes_begin() const { + assert(isValidElf() && "Invalid ELF."); + return ElfLSegmentNoteIterator(reinterpret_cast(Impl)); +} + +ElfLSegmentNoteIterator ElfL::segment_notes_end() const { + assert(isValidElf() && "Invalid ELF."); + return ElfLSegmentNoteIterator(reinterpret_cast(Impl), + true); +} + +ElfLNote::ElfLNote(const void *IteratorImpl) { + const ElfLNoteIteratorImplBase *IImpl = + reinterpret_cast(IteratorImpl); + Impl = **IImpl; +} + +ElfLNote::ElfLNote(const ElfLNote &Other) { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + if (NImpl) + Impl = NImpl->clone(); +} + +ElfLNote::~ElfLNote() { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + delete NImpl; +} + +uint64_t ElfLNote::getNameSize() const { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + return NImpl->getNameSize(); +} + +const char *ElfLNote::getName() const { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + return NImpl->getName(); +} + +uint64_t ElfLNote::getDescSize() const { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + return NImpl->getDescSize(); +} + +const uint8_t *ElfLNote::getDesc() const { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + return NImpl->getDesc(); +} + +uint64_t ElfLNote::getType() const { + const ElfLNoteImplBase *NImpl = + reinterpret_cast(Impl); + return NImpl->getType(); +} + +ElfLSection::ElfLSection(const void *I) { + Impl = I; +} + +ElfLSection::ElfLSection(const ElfLSection &Other) { + const ElfLSectionImplBase *SImpl = + reinterpret_cast(Other.Impl); + Impl = SImpl->clone(); +} + +ElfLSection::~ElfLSection() { + const ElfLSectionImplBase *SImpl = + reinterpret_cast(Impl); + delete SImpl; +} + +const char *ElfLSection::getName() const { + const ElfLSectionImplBase *SImpl = + reinterpret_cast(Impl); + return SImpl->getName(); +} + +uint64_t ElfLSection::getSize() const { + const ElfLSectionImplBase *SImpl = + reinterpret_cast(Impl); + return SImpl->getSize(); +} + +const uint8_t *ElfLSection::getContents() const { + const ElfLSectionImplBase *SImpl = + reinterpret_cast(Impl); + return SImpl->getContents(); +} + +ElfLSectionIterator ElfL::sections_begin() const { + assert(isValidElf() && "Invalid ELF."); + return ElfLSectionIterator(reinterpret_cast(Impl)); +} + +ElfLSectionIterator ElfL::sections_end() const { + assert(isValidElf() && "Invalid ELF."); + return ElfLSectionIterator(reinterpret_cast(Impl), + true); +} + +ElfLSectionIterator::ElfLSectionIterator(const 
void *I, bool IsEnd) { + const ElfLImplBase *EImpl = reinterpret_cast(I); + Impl = EImpl->createSectionIteratorImpl(IsEnd); +} + +ElfLSectionIterator::ElfLSectionIterator( + const ElfLSectionIterator &Other) { + ElfLSectionIteratorImplBase *IImpl = + reinterpret_cast(Other.Impl); + Impl = IImpl->clone(); +} + +ElfLSectionIterator::~ElfLSectionIterator() { + ElfLSectionIteratorImplBase *IImpl = + reinterpret_cast(Impl); + delete IImpl; +} + +bool ElfLSectionIterator::operator==( + const ElfLSectionIterator Other) const { + const ElfLSectionIteratorImplBase *Lhs = + reinterpret_cast(Impl); + const ElfLSectionIteratorImplBase *Rhs = + reinterpret_cast(Other.Impl); + return (*Lhs == *Rhs); +} + +bool ElfLSectionIterator::operator!=( + const ElfLSectionIterator Other) const { + return !(*this == Other); +} + +ElfLSectionIterator &ElfLSectionIterator::operator++() { + ElfLSectionIteratorImplBase *IImpl = + reinterpret_cast(Impl); + ++(*IImpl); + return *this; +} + +ElfLSection ElfLSectionIterator::operator*() const { + ElfLSectionIteratorImplBase *IImpl = + reinterpret_cast(Impl); + return ElfLSection(**IImpl); +} +#endif // !MAY_USE_LIBELF diff --git a/openmp/libomptarget/plugins/opencl/CMakeLists.txt b/openmp/libomptarget/plugins/opencl/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/opencl/CMakeLists.txt @@ -0,0 +1,69 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Plugin for OpenCL device +# +##===----------------------------------------------------------------------===## + +if(LIBOMPTARGET_DEP_OPENCL_FOUND) + if((CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)$" OR + CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") AND + (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Windows")) + + libomptarget_say("Building OpenCL offloading plugin.") + + # Define the suffix for the runtime messaging dumps. + add_definitions(-DTARGET_NAME=OPENCL) + add_definitions(-DCL_TARGET_OPENCL_VERSION=210) + + if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + add_definitions(-DSPIR_ERROR_REPORT) + add_definitions(-DOMPTARGET_OPENCL_DEBUG) + endif() + + add_library(omptarget.rtl.opencl SHARED src/rtl.cpp) + + target_include_directories(omptarget.rtl.opencl PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} + ${LIBOMPTARGET_DEP_OPENCL_INCLUDE_DIRS} + ${LIBOMPTARGET_LLVM_INCLUDE_DIRS} + ) + + if (NOT OPENMP_STANDALONE_BUILD) + # We have to disable EH for Windows compilation. + # For standalone OpenMP build, we need to come up + # with our own EH flags management. + llvm_update_compile_flags(omptarget.rtl.opencl) + endif() + + install(TARGETS omptarget.rtl.opencl + LIBRARY DESTINATION lib${LIBOMPTARGET_LIBDIR_SUFFIX}) + + # OpenMP runtime library must be linked in. 
+    target_link_libraries(omptarget.rtl.opencl PRIVATE omp)
+
+    target_link_libraries(omptarget.rtl.opencl PRIVATE elf_common LLVMSupport
+      ${LIBOMPTARGET_DEP_OPENCL_LIBRARIES})
+
+    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+      target_link_libraries(omptarget.rtl.opencl PRIVATE dl
+        ${OPENMP_PTHREAD_LIB}
+        "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+    elseif (NOT CMAKE_SYSTEM_NAME MATCHES "Windows")
+      message(FATAL_ERROR "Missing platform support.")
+    endif()
+
+    # Report to the parent scope that we are building a plugin for OpenCL.
+    set(LIBOMPTARGET_SYSTEM_TARGETS
+      "${LIBOMPTARGET_SYSTEM_TARGETS} spir64-unknown-unknown" PARENT_SCOPE)
+  else()
+    libomptarget_say("Not building OpenCL offloading plugin: OpenCL offloading is only supported on x86_64 Linux and Windows hosts.")
+  endif()
+else()
+  libomptarget_say("Not building OpenCL offloading plugin: OpenCL was not found on the system.")
+endif()
diff --git a/openmp/libomptarget/plugins/opencl/src/rtl-trace.h b/openmp/libomptarget/plugins/opencl/src/rtl-trace.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/plugins/opencl/src/rtl-trace.h
@@ -0,0 +1,1176 @@
+//===--- Target RTLs Implementation ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code for tracing RTL
+//
+//===----------------------------------------------------------------------===//
+#ifndef RTL_TRACE_H
+#define RTL_TRACE_H
+
+#include
+#include
+#include
+#include
+#include "omptarget.h"
+#include "Debug.h"
+
+#define STR(x) #x
+#define TO_STRING(x) STR(x)
+
+#define TARGET_NAME OPENCL
+#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
+
+extern int DebugLevel;
+
+#define DPCALL(...) \
+  do { \
+    if (DebugLevel > 1) \
+      DP(__VA_ARGS__); \
+  } while (0)
+
+#define FATAL_ERROR(Msg) \
+  do { \
+    fprintf(stderr, "%s --> ", DEBUG_PREFIX); \
+    fprintf(stderr, "Error: %s failed (%s) -- exiting...\n", __func__, Msg); \
+    exit(EXIT_FAILURE); \
+  } while (0)
+
+#define WARNING(...)
\ + do { \ + fprintf(stderr, "%s --> ", DEBUG_PREFIX); \ + fprintf(stderr, "Warning: " __VA_ARGS__); \ + } while (0) + +typedef cl_int (CL_API_CALL *clGetDeviceGlobalVariablePointerINTEL_fn)( + cl_device_id, + cl_program, + const char *, + size_t *, + void **); +typedef cl_int (CL_API_CALL *clGetKernelSuggestedLocalWorkSizeINTEL_fn)( + cl_command_queue, + cl_kernel, + cl_uint, + const size_t *, + const size_t *, + size_t *); +typedef cl_int (CL_API_CALL *clSetProgramSpecializationConstant_fn)( + cl_program, cl_uint, size_t, const void *); + +#define FOR_EACH_COMMON_EXTENSION_FN(M) \ + M(clGetMemAllocInfoINTEL) \ + M(clHostMemAllocINTEL) \ + M(clDeviceMemAllocINTEL) \ + M(clSharedMemAllocINTEL) \ + M(clMemFreeINTEL) \ + M(clSetKernelArgMemPointerINTEL) \ + M(clEnqueueMemcpyINTEL) \ + M(clSetProgramSpecializationConstant) \ + M(clGetDeviceGlobalVariablePointerINTEL) \ + M(clGetKernelSuggestedLocalWorkSizeINTEL) + +#define FOR_EACH_EXTENSION_FN(M) FOR_EACH_COMMON_EXTENSION_FN(M) + +enum ExtensionIdTy { +#define EXTENSION_FN_ID(Fn) Fn##Id, + FOR_EACH_EXTENSION_FN(EXTENSION_FN_ID) + ExtensionIdLast +}; + +#define FOREACH_CL_ERROR_CODE(FN) \ + FN(CL_SUCCESS) \ + FN(CL_DEVICE_NOT_FOUND) \ + FN(CL_DEVICE_NOT_AVAILABLE) \ + FN(CL_COMPILER_NOT_AVAILABLE) \ + FN(CL_MEM_OBJECT_ALLOCATION_FAILURE) \ + FN(CL_OUT_OF_RESOURCES) \ + FN(CL_OUT_OF_HOST_MEMORY) \ + FN(CL_PROFILING_INFO_NOT_AVAILABLE) \ + FN(CL_MEM_COPY_OVERLAP) \ + FN(CL_IMAGE_FORMAT_MISMATCH) \ + FN(CL_IMAGE_FORMAT_NOT_SUPPORTED) \ + FN(CL_BUILD_PROGRAM_FAILURE) \ + FN(CL_MAP_FAILURE) \ + FN(CL_MISALIGNED_SUB_BUFFER_OFFSET) \ + FN(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) \ + FN(CL_COMPILE_PROGRAM_FAILURE) \ + FN(CL_LINKER_NOT_AVAILABLE) \ + FN(CL_LINK_PROGRAM_FAILURE) \ + FN(CL_DEVICE_PARTITION_FAILED) \ + FN(CL_KERNEL_ARG_INFO_NOT_AVAILABLE) \ + FN(CL_INVALID_VALUE) \ + FN(CL_INVALID_DEVICE_TYPE) \ + FN(CL_INVALID_PLATFORM) \ + FN(CL_INVALID_DEVICE) \ + FN(CL_INVALID_CONTEXT) \ + FN(CL_INVALID_QUEUE_PROPERTIES) \ + FN(CL_INVALID_COMMAND_QUEUE) \ + FN(CL_INVALID_HOST_PTR) \ + FN(CL_INVALID_MEM_OBJECT) \ + FN(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) \ + FN(CL_INVALID_IMAGE_SIZE) \ + FN(CL_INVALID_SAMPLER) \ + FN(CL_INVALID_BINARY) \ + FN(CL_INVALID_BUILD_OPTIONS) \ + FN(CL_INVALID_PROGRAM) \ + FN(CL_INVALID_PROGRAM_EXECUTABLE) \ + FN(CL_INVALID_KERNEL_NAME) \ + FN(CL_INVALID_KERNEL_DEFINITION) \ + FN(CL_INVALID_KERNEL) \ + FN(CL_INVALID_ARG_INDEX) \ + FN(CL_INVALID_ARG_VALUE) \ + FN(CL_INVALID_ARG_SIZE) \ + FN(CL_INVALID_KERNEL_ARGS) \ + FN(CL_INVALID_WORK_DIMENSION) \ + FN(CL_INVALID_WORK_GROUP_SIZE) \ + FN(CL_INVALID_WORK_ITEM_SIZE) \ + FN(CL_INVALID_GLOBAL_OFFSET) \ + FN(CL_INVALID_EVENT_WAIT_LIST) \ + FN(CL_INVALID_EVENT) \ + FN(CL_INVALID_OPERATION) \ + FN(CL_INVALID_GL_OBJECT) \ + FN(CL_INVALID_BUFFER_SIZE) \ + FN(CL_INVALID_MIP_LEVEL) \ + FN(CL_INVALID_GLOBAL_WORK_SIZE) \ + FN(CL_INVALID_PROPERTY) \ + FN(CL_INVALID_IMAGE_DESCRIPTOR) \ + FN(CL_INVALID_COMPILER_OPTIONS) \ + FN(CL_INVALID_LINKER_OPTIONS) \ + FN(CL_INVALID_DEVICE_PARTITION_COUNT) \ + FN(CL_INVALID_PIPE_SIZE) \ + FN(CL_INVALID_DEVICE_QUEUE) + +#define CASE_TO_STRING(s) case s: return #s; + +const char *getCLErrorName(int error) { + switch (error) { + FOREACH_CL_ERROR_CODE(CASE_TO_STRING) + default: + return "Unknown Error"; + } +} + +#define TRACE_FN(Name) CLTR##Name +#define TRACE_FN_ARG_BEGIN() \ + do { \ + std::string fn(__func__); \ + DPCALL("CL_CALLEE: %s (\n", fn.substr(4).c_str()); \ + } while (0) +#define TRACE_FN_ARG_END() DPCALL(")\n") +#define TRACE_FN_ARG(Arg, 
Fmt) DPCALL(" %s = " Fmt "\n", TO_STRING(Arg), Arg) +#define TRACE_FN_ARG_PTR(Arg) \ + DPCALL(" %s = " DPxMOD "\n", TO_STRING(Arg), DPxPTR(Arg)) +#define TRACE_FN_ARG_INT(Arg) TRACE_FN_ARG(Arg, "%" PRId32) +#define TRACE_FN_ARG_SIZE(Arg) TRACE_FN_ARG(Arg, "%zu") +#define TRACE_FN_ARG_UINT(Arg) TRACE_FN_ARG(Arg, "%" PRIu32) +#define TRACE_FN_ARG_ULONG(Arg) TRACE_FN_ARG(Arg, "%" PRIu64) + +cl_int TRACE_FN(clCompileProgram)( + cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + cl_uint num_input_headers, + const cl_program *input_headers, + const char **header_include_names, + void (CL_CALLBACK *pfn_notify)(cl_program, void *), + void *user_data) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(program); + TRACE_FN_ARG_UINT(num_devices); + TRACE_FN_ARG_PTR(device_list); + TRACE_FN_ARG_PTR(options); + TRACE_FN_ARG_UINT(num_input_headers); + TRACE_FN_ARG_PTR(input_headers); + TRACE_FN_ARG_PTR(header_include_names); + TRACE_FN_ARG_PTR(pfn_notify); + TRACE_FN_ARG_PTR(user_data); + TRACE_FN_ARG_END(); + return clCompileProgram(program, num_devices, device_list, options, + num_input_headers, input_headers, + header_include_names, pfn_notify, user_data); +} + +cl_int TRACE_FN(clBuildProgram)( + cl_program program, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + void (CL_CALLBACK *pfn_notify)(cl_program, void *), + void *user_data) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(program); + TRACE_FN_ARG_UINT(num_devices); + TRACE_FN_ARG_PTR(device_list); + TRACE_FN_ARG_PTR(options); + TRACE_FN_ARG_PTR(pfn_notify); + TRACE_FN_ARG_PTR(user_data); + TRACE_FN_ARG_END(); + return clBuildProgram(program, num_devices, device_list, options, pfn_notify, + user_data); +} + +cl_mem TRACE_FN(clCreateBuffer)( + cl_context context, + cl_mem_flags flags, + size_t size, + void *host_ptr, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_ULONG(flags); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_PTR(host_ptr); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return clCreateBuffer(context, flags, size, host_ptr, errcode_ret); +} + +cl_command_queue TRACE_FN(clCreateCommandQueueWithProperties)( + cl_context context, + cl_device_id device, + const cl_queue_properties *properties, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_PTR(properties); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return clCreateCommandQueueWithProperties(context, device, properties, + errcode_ret); +} + +cl_context TRACE_FN(clCreateContext)( + const cl_context_properties *properties, + cl_uint num_devices, + const cl_device_id *devices, + void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(properties); + TRACE_FN_ARG_UINT(num_devices); + TRACE_FN_ARG_PTR(devices); + TRACE_FN_ARG_PTR(pfn_notify); + TRACE_FN_ARG_PTR(user_data); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return clCreateContext(properties, num_devices, devices, pfn_notify, + user_data, errcode_ret); +} + +cl_kernel TRACE_FN(clCreateKernel)( + cl_program program, + const char *kernel_name, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(program); + TRACE_FN_ARG_PTR(kernel_name); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return clCreateKernel(program, kernel_name, errcode_ret); +} + +cl_program 
TRACE_FN(clCreateProgramWithIL)( + cl_context context, + const void *il, + size_t length, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(il); + TRACE_FN_ARG_SIZE(length); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return clCreateProgramWithIL(context, il, length, errcode_ret); +} + +cl_int TRACE_FN(clEnqueueBarrierWithWaitList)( + cl_command_queue command_queue, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueBarrierWithWaitList(command_queue, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int TRACE_FN(clEnqueueNDRangeKernel)( + cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + const size_t *local_work_size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(work_dim); + TRACE_FN_ARG_PTR(global_work_offset); + TRACE_FN_ARG_PTR(global_work_size); + TRACE_FN_ARG_PTR(local_work_size); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueNDRangeKernel(command_queue, kernel, work_dim, + global_work_offset, global_work_size, + local_work_size, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int TRACE_FN(clEnqueueReadBuffer)( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_PTR(buffer); + TRACE_FN_ARG_UINT(blocking_read); + TRACE_FN_ARG_SIZE(offset); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_PTR(ptr); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueReadBuffer(command_queue, buffer, blocking_read, offset, size, + ptr, num_events_in_wait_list, event_wait_list, + event); +} + +cl_int TRACE_FN(clEnqueueSVMMap)( + cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void *svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_UINT(blocking_map); + TRACE_FN_ARG_ULONG(flags); + TRACE_FN_ARG_PTR(svm_ptr); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueSVMMap(command_queue, blocking_map, flags, svm_ptr, size, + num_events_in_wait_list, event_wait_list, event); +} + +cl_int TRACE_FN(clEnqueueSVMMemcpy)( + cl_command_queue command_queue, + cl_bool blocking_copy, + void *dst_ptr, + const void *src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_UINT(blocking_copy); + TRACE_FN_ARG_PTR(dst_ptr); + TRACE_FN_ARG_PTR(src_ptr); + TRACE_FN_ARG_SIZE(size); + 
TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueSVMMemcpy(command_queue, blocking_copy, dst_ptr, src_ptr, + size, num_events_in_wait_list, event_wait_list, + event); +} + +cl_int TRACE_FN(clEnqueueMemcpyINTEL)( + clEnqueueMemcpyINTEL_fn funcptr, + cl_command_queue command_queue, + cl_bool blocking, + void *dst_ptr, + const void *src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_UINT(blocking); + TRACE_FN_ARG_PTR(dst_ptr); + TRACE_FN_ARG_PTR(src_ptr); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return funcptr(command_queue, blocking, dst_ptr, src_ptr, size, + num_events_in_wait_list, event_wait_list, event); +} + +cl_int TRACE_FN(clEnqueueSVMUnmap)( + cl_command_queue command_queue, + void *svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_PTR(svm_ptr); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueSVMUnmap(command_queue, svm_ptr, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int TRACE_FN(clEnqueueWriteBuffer)( + cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void *ptr, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_PTR(buffer); + TRACE_FN_ARG_UINT(blocking_write); + TRACE_FN_ARG_SIZE(offset); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_PTR(ptr); + TRACE_FN_ARG_UINT(num_events_in_wait_list); + TRACE_FN_ARG_PTR(event_wait_list); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_END(); + return clEnqueueWriteBuffer(command_queue, buffer, blocking_write, offset, + size, ptr, num_events_in_wait_list, + event_wait_list, event); +} + +cl_int TRACE_FN(clFinish)( + cl_command_queue command_queue) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_END(); + return clFinish(command_queue); +} + +cl_int TRACE_FN(clGetDeviceAndHostTimer)( + cl_device_id device, + cl_ulong *device_timestamp, + cl_ulong *host_timestamp) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_PTR(device_timestamp); + TRACE_FN_ARG_PTR(host_timestamp); + TRACE_FN_ARG_END(); + return clGetDeviceAndHostTimer(device, device_timestamp, host_timestamp); +} + +cl_int TRACE_FN(clGetDeviceIDs)( + cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id *devices, + cl_uint *num_devices) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(platform); + TRACE_FN_ARG_ULONG(device_type); + TRACE_FN_ARG_UINT(num_entries); + TRACE_FN_ARG_PTR(devices); + TRACE_FN_ARG_PTR(num_devices); + TRACE_FN_ARG_END(); + return clGetDeviceIDs(platform, device_type, num_entries, devices, + num_devices); +} + +cl_int TRACE_FN(clGetDeviceInfo)( + cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + 
TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetDeviceInfo(device, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int TRACE_FN(clGetEventInfo)( + cl_event event, + cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetEventInfo(event, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int TRACE_FN(clGetEventProfilingInfo)( + cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetEventProfilingInfo(event, param_name, param_value_size, + param_value, param_value_size_ret); +} + +void *TRACE_FN(clGetExtensionFunctionAddressForPlatform)( + cl_platform_id platform, + const char *funcname) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(platform); + TRACE_FN_ARG_PTR(funcname); + TRACE_FN_ARG_END(); + return clGetExtensionFunctionAddressForPlatform(platform, funcname); +} + +cl_int TRACE_FN(clGetKernelArgInfo)( + cl_kernel kernel, + cl_uint arg_index, + cl_kernel_arg_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(arg_index); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetKernelArgInfo(kernel, arg_index, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_int TRACE_FN(clGetKernelInfo)( + cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetKernelInfo(kernel, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int TRACE_FN(clGetKernelSubGroupInfo)( + cl_kernel kernel, + cl_device_id device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void *input_value, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(input_value_size); + TRACE_FN_ARG_PTR(input_value); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetKernelSubGroupInfo(kernel, device, param_name, input_value_size, + input_value, param_value_size, param_value, + param_value_size_ret); +} + +cl_int TRACE_FN(clGetKernelWorkGroupInfo)( + cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_PTR(device); + 
TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetKernelWorkGroupInfo(kernel, device, param_name, param_value_size, + param_value, param_value_size_ret); +} + +cl_int TRACE_FN(clGetMemAllocInfoINTEL)( + clGetMemAllocInfoINTEL_fn funcptr, + cl_context context, + const void *ptr, + cl_mem_info_intel param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(ptr); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return funcptr(context, ptr, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int TRACE_FN(clGetPlatformIDs)( + cl_uint num_entries, + cl_platform_id *platforms, + cl_uint *num_platforms) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_UINT(num_entries); + TRACE_FN_ARG_PTR(platforms); + TRACE_FN_ARG_PTR(num_platforms); + TRACE_FN_ARG_END(); + return clGetPlatformIDs(num_entries, platforms, num_platforms); +} + +cl_int TRACE_FN(clGetPlatformInfo)( + cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(platform); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetPlatformInfo(platform, param_name, param_value_size, param_value, + param_value_size_ret); +} + +cl_int TRACE_FN(clGetProgramBuildInfo)( + cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(program); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_PTR(param_value_size_ret); + TRACE_FN_ARG_END(); + return clGetProgramBuildInfo(program, device, param_name, param_value_size, + param_value, param_value_size_ret); +} + +void *TRACE_FN(clHostMemAllocINTEL)( + clHostMemAllocINTEL_fn funcptr, + cl_context context, + const cl_mem_properties_intel *properties, + size_t size, + cl_uint alignment, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(properties); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_UINT(alignment); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return funcptr(context, properties, size, alignment, errcode_ret); +} + +void *TRACE_FN(clDeviceMemAllocINTEL)( + clDeviceMemAllocINTEL_fn funcptr, + cl_context context, + cl_device_id device, + const cl_mem_properties_intel *properties, + size_t size, + cl_uint alignment, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_PTR(properties); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_UINT(alignment); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return funcptr(context, device, properties, size, alignment, errcode_ret); +} + +cl_program TRACE_FN(clLinkProgram)( + cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const char *options, + cl_uint num_input_programs, + const cl_program *input_programs, + void (CL_CALLBACK *pfn_notify)(cl_program, 
void *), + void *user_data, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_UINT(num_devices); + TRACE_FN_ARG_PTR(device_list); + TRACE_FN_ARG_PTR(options); + TRACE_FN_ARG_UINT(num_input_programs); + TRACE_FN_ARG_PTR(input_programs); + TRACE_FN_ARG_PTR(pfn_notify); + TRACE_FN_ARG_PTR(user_data); + TRACE_FN_ARG_END(); + return clLinkProgram(context, num_devices, device_list, options, + num_input_programs, input_programs, pfn_notify, + user_data, errcode_ret); +} + +cl_program TRACE_FN(clCreateProgramWithBinary)( + cl_context context, + cl_uint num_devices, + const cl_device_id *device_list, + const size_t *lengths, + const unsigned char **binaries, + cl_int *binary_status, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_UINT(num_devices); + TRACE_FN_ARG_PTR(device_list); + TRACE_FN_ARG_PTR(lengths); + TRACE_FN_ARG_PTR(binaries); + TRACE_FN_ARG_PTR(binary_status); + TRACE_FN_ARG_END(); + return clCreateProgramWithBinary(context, num_devices, device_list, lengths, + binaries, binary_status, errcode_ret); +} + +cl_int TRACE_FN(clMemFreeINTEL)( + clMemFreeINTEL_fn funcptr, + cl_context context, + void *ptr) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(ptr); + TRACE_FN_ARG_END(); + return funcptr(context, ptr); +} + +cl_int TRACE_FN(clReleaseCommandQueue)( + cl_command_queue command_queue) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_END(); + return clReleaseCommandQueue(command_queue); +} + +cl_int TRACE_FN(clReleaseContext)( + cl_context context) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_END(); + return clReleaseContext(context); +} + +cl_int TRACE_FN(clReleaseKernel)( + cl_kernel kernel) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_END(); + return clReleaseKernel(kernel); +} + +cl_int TRACE_FN(clReleaseMemObject)( + cl_mem memobj) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(memobj); + TRACE_FN_ARG_END(); + return clReleaseMemObject(memobj); +} + +cl_int TRACE_FN(clReleaseProgram)( + cl_program program) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(program); + TRACE_FN_ARG_END(); + return clReleaseProgram(program); +} + +cl_int TRACE_FN(clSetEventCallback)( + cl_event event, + cl_int command_exec_callback_type, + void (CL_CALLBACK *pfn_notify)(cl_event, cl_int, void *), + void *user_data) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(event); + TRACE_FN_ARG_INT(command_exec_callback_type); + TRACE_FN_ARG_PTR(pfn_notify); + TRACE_FN_ARG_PTR(user_data); + TRACE_FN_ARG_END(); + return clSetEventCallback(event, command_exec_callback_type, pfn_notify, + user_data); +} + +cl_int TRACE_FN(clSetKernelArg)( + cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void *arg_value) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(arg_index); + TRACE_FN_ARG_SIZE(arg_size); + TRACE_FN_ARG_PTR(arg_value); + TRACE_FN_ARG_END(); + return clSetKernelArg(kernel, arg_index, arg_size, arg_value); +} + +cl_int TRACE_FN(clSetKernelArgSVMPointer)( + cl_kernel kernel, + cl_uint arg_index, + const void *arg_value) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(arg_index); + TRACE_FN_ARG_PTR(arg_value); + TRACE_FN_ARG_END(); + return clSetKernelArgSVMPointer(kernel, arg_index, arg_value); +} + +cl_int TRACE_FN(clSetKernelArgMemPointerINTEL)( + clSetKernelArgMemPointerINTEL_fn funcptr, + cl_kernel kernel, + cl_uint arg_index, + const void *arg_value) { + 
TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(arg_index); + TRACE_FN_ARG_PTR(arg_value); + TRACE_FN_ARG_END(); + return funcptr(kernel, arg_index, arg_value); +} + +cl_int TRACE_FN(clSetKernelExecInfo)( + cl_kernel kernel, + cl_kernel_exec_info param_name, + size_t param_value_size, + const void *param_value) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(param_name); + TRACE_FN_ARG_SIZE(param_value_size); + TRACE_FN_ARG_PTR(param_value); + TRACE_FN_ARG_END(); + return clSetKernelExecInfo(kernel, param_name, param_value_size, param_value); +} + +void *TRACE_FN(clSharedMemAllocINTEL)( + clSharedMemAllocINTEL_fn funcptr, + cl_context context, + cl_device_id device, + const cl_mem_properties_intel *properties, + size_t size, + cl_uint alignment, + cl_int *errcode_ret) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(device); + TRACE_FN_ARG_PTR(properties); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_UINT(alignment); + TRACE_FN_ARG_PTR(errcode_ret); + TRACE_FN_ARG_END(); + return funcptr(context, device, properties, size, alignment, errcode_ret); +} + +void *TRACE_FN(clSVMAlloc)( + cl_context context, + cl_svm_mem_flags flags, + size_t size, + cl_uint alignment) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_ULONG(flags); + TRACE_FN_ARG_SIZE(size); + TRACE_FN_ARG_UINT(alignment); + TRACE_FN_ARG_END(); + return clSVMAlloc(context, flags, size, alignment); +} + +void TRACE_FN(clSVMFree)( + cl_context context, + void *svm_pointer) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(context); + TRACE_FN_ARG_PTR(svm_pointer); + TRACE_FN_ARG_END(); + clSVMFree(context, svm_pointer); +} + +cl_int TRACE_FN(clWaitForEvents)( + cl_uint num_events, + const cl_event *event_list) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_UINT(num_events); + TRACE_FN_ARG_PTR(event_list); + TRACE_FN_ARG_END(); + return clWaitForEvents(num_events, event_list); +} + +cl_int TRACE_FN(clGetKernelSuggestedLocalWorkSizeINTEL)( + clGetKernelSuggestedLocalWorkSizeINTEL_fn funcptr, + cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *global_work_offset, + const size_t *global_work_size, + size_t *suggested_local_work_size) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(command_queue); + TRACE_FN_ARG_PTR(kernel); + TRACE_FN_ARG_UINT(work_dim); + TRACE_FN_ARG_PTR(global_work_offset); + TRACE_FN_ARG_PTR(global_work_size); + TRACE_FN_ARG_PTR(suggested_local_work_size); + TRACE_FN_ARG_END(); + return funcptr(command_queue, kernel, work_dim, global_work_offset, + global_work_size, suggested_local_work_size); +} + +cl_int TRACE_FN(clSetProgramSpecializationConstant)( + clSetProgramSpecializationConstant_fn funcptr, + cl_program program, + cl_uint spec_id, + size_t spec_size, + const void* spec_value) { + TRACE_FN_ARG_BEGIN(); + TRACE_FN_ARG_PTR(program); + TRACE_FN_ARG_UINT(spec_id); + TRACE_FN_ARG_SIZE(spec_size); + TRACE_FN_ARG_PTR(spec_value); + TRACE_FN_ARG_END(); + return funcptr(program, spec_id, spec_size, spec_value); +} + +/// Calls without error check +#define CALL_CL_SILENT(Rc, Fn, ...) \ + do { \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \ + Rc = TRACE_FN(Fn)(__VA_ARGS__); \ + } else { \ + Rc = Fn(__VA_ARGS__); \ + } \ + } while (0) + +/// Calls that only have return code +#define CALL_CL(Rc, Fn, ...) 
\ + do { \ + CALL_CL_SILENT(Rc, Fn, __VA_ARGS__); \ + if (Rc != CL_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc, \ + getCLErrorName(Rc)); \ + } \ + } while (0) + +/// Emit warning for unsuccessful CL call +#define CALL_CLW(Rc, Fn, ...) \ + do { \ + CALL_CL_SILENT(Rc, Fn, __VA_ARGS__); \ + if (Rc != CL_SUCCESS) { \ + DP("Warning: %s:%s returned %d, %s\n", __func__, #Fn, Rc, \ + getCLErrorName(Rc)); \ + } \ + } while (0) + +#define CALL_CL_RET(Ret, Fn, ...) \ + do { \ + cl_int rc; \ + CALL_CL(rc, Fn, __VA_ARGS__); \ + if (rc != CL_SUCCESS) \ + return Ret; \ + } while (0) + +#define CALL_CLW_RET(Ret, Fn, ...) \ + do { \ + cl_int rc; \ + CALL_CLW(rc, Fn, __VA_ARGS__); \ + if (rc != CL_SUCCESS) \ + return Ret; \ + } while (0) + +#define CALL_CL_EXIT_FAIL(Fn, ...) \ + do { \ + cl_int rc; \ + CALL_CL(rc, Fn, __VA_ARGS__); \ + if (rc != CL_SUCCESS) \ + exit(EXIT_FAILURE); \ + } while (0) + +#define CALL_CL_RET_FAIL(Fn, ...) CALL_CL_RET(OFFLOAD_FAIL, Fn, __VA_ARGS__) +#define CALL_CL_RET_NULL(Fn, ...) CALL_CL_RET(nullptr, Fn, __VA_ARGS__) +#define CALL_CL_RET_ZERO(Fn, ...) CALL_CL_RET(0, Fn, __VA_ARGS__) +#define CALL_CL_RET_VOID(Fn, ...) CALL_CL_RET(, Fn, __VA_ARGS__) +#define CALL_CLW_RET_VOID(Fn, ...) CALL_CLW_RET(, Fn, __VA_ARGS__) + +/// Calls that have return value and return code +#define CALL_CL_RVRC(Rv, Fn, Rc, ...) \ + do { \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \ + Rv = TRACE_FN(Fn)(__VA_ARGS__, &Rc); \ + } else { \ + Rv = Fn(__VA_ARGS__, &Rc); \ + } \ + if (Rc != CL_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", __func__, #Fn, Rc, \ + getCLErrorName(Rc)); \ + } \ + } while (0) + +/// Calls that only have return value +#define CALL_CL_RV(Rv, Fn, ...) \ + do { \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \ + Rv = TRACE_FN(Fn)(__VA_ARGS__); \ + } else { \ + Rv = Fn(__VA_ARGS__); \ + } \ + } while (0) + +/// Calls that don't return anything +#define CALL_CL_VOID(Fn, ...) \ + do { \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", TO_STRING(Fn), TO_STRING(( __VA_ARGS__ ))); \ + TRACE_FN(Fn)(__VA_ARGS__); \ + } else { \ + Fn(__VA_ARGS__); \ + } \ + } while (0) + +/// Call extension function, return nothing +#define CALL_CL_EXT_VOID(DeviceId, Name, ...) \ + do { \ + Name##_fn Fn = reinterpret_cast( \ + DeviceInfo->getExtensionFunctionPtr(DeviceId, Name##Id)); \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", \ + TO_STRING(Name), TO_STRING(( __VA_ARGS__ ))); \ + TRACE_FN(Name)(Fn, __VA_ARGS__); \ + } else { \ + (*Fn)(__VA_ARGS__); \ + } \ + } while (0) + +/// Extension calls without error reporting +#define CALL_CL_EXT_SILENT(DeviceId, Rc, Name, ...) \ + do { \ + Name##_fn Fn = reinterpret_cast( \ + DeviceInfo->getExtensionFunctionPtr(DeviceId, Name##Id)); \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", \ + TO_STRING(Name), TO_STRING(( __VA_ARGS__ ))); \ + Rc = TRACE_FN(Name)(Fn, __VA_ARGS__); \ + } else { \ + Rc = (*Fn)(__VA_ARGS__); \ + } \ + } while (0) + +/// Extension calls that only have return code +#define CALL_CL_EXT(DeviceId, Rc, Name, ...) \ + do { \ + CALL_CL_EXT_SILENT(DeviceId, Rc, Name, __VA_ARGS__); \ + if (Rc != CL_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", \ + __func__, TO_STRING(Name), Rc, getCLErrorName(Rc)); \ + } \ + } while (0) + +/// Extension calls that have return value and return code +#define CALL_CL_EXT_RVRC(DeviceId, Rv, Name, Rc, ...) 
\ + do { \ + Name##_fn Fn = reinterpret_cast( \ + DeviceInfo->getExtensionFunctionPtr(DeviceId, Name##Id)); \ + if (DebugLevel > 1) { \ + DPCALL("CL_CALLER: %s %s\n", \ + TO_STRING(Name), TO_STRING(( __VA_ARGS__ ))); \ + Rv = TRACE_FN(Name)(Fn, __VA_ARGS__, &Rc); \ + } else { \ + Rv = (*Fn)(__VA_ARGS__, &Rc); \ + } \ + if (Rc != CL_SUCCESS) { \ + DP("Error: %s:%s failed with error code %d, %s\n", \ + __func__, TO_STRING(Name), Rc, getCLErrorName(Rc)); \ + } \ + } while (0) + +#define CALL_CL_EXT_RET(DeviceId, Ret, Name, ...) \ + do { \ + cl_int rc; \ + CALL_CL_EXT(DeviceId, rc, Name, __VA_ARGS__); \ + if (rc != CL_SUCCESS) \ + return Ret; \ + } while (0) + +#define CALL_CL_EXT_RET_FAIL(DeviceId, Name, ...) \ + CALL_CL_EXT_RET(DeviceId, OFFLOAD_FAIL, Name, __VA_ARGS__) +#define CALL_CL_EXT_RET_NULL(DeviceId, Name, ...) \ + CALL_CL_EXT_RET(DeviceId, nullptr, Name, __VA_ARGS__) + +#endif // !defined(RTL_TRACE_H) diff --git a/openmp/libomptarget/plugins/opencl/src/rtl.cpp b/openmp/libomptarget/plugins/opencl/src/rtl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins/opencl/src/rtl.cpp @@ -0,0 +1,3969 @@ +//===--- Target RTLs Implementation ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is modified from https://github.com/daniel-schuermann/openmp.git. +// Thanks to Daniel Scheuermann, the author of rtl.cpp. +// +// RTL for SPIR-V/OpenCL machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _WIN32 +#include +#include +#include +#else +#include +#include +#endif + +#include "elf_light.h" +#include "omptargetplugin.h" +#include "rtl-trace.h" + +#include "llvm/Support/Endian.h" + +/// Additional TARGET_ALLOC* definition for the plugin +constexpr int32_t TARGET_ALLOC_SVM = INT32_MAX; + +/// Device type enumeration common to compiler and runtime +enum DeviceArch : uint64_t { + DeviceArch_None = 0, + DeviceArch_Gen9 = 0x0001, + DeviceArch_XeLP = 0x0002, + DeviceArch_XeHP = 0x0004, + DeviceArch_x86_64 = 0x0100 +}; + +/// Mapping from device arch to GPU runtime's device identifiers +#ifdef _WIN32 +/// For now, we need to depend on known published product names +std::map> DeviceArchStrMap { + { + DeviceArch_Gen9, { + "HD Graphics", + "UHD Graphics", + "Pro Graphics", + "Plus Graphics", + "Iris(TM) Graphics", + } + }, + { + DeviceArch_XeLP, { + "Xe Graphics", + "Xe MAX Graphics" + } + } + // TODO: how to detect XeHP? + // Using XeHP on Windows seems to be a rare case. +}; +#endif // _WIN32 +std::map> DeviceArchMap { + { + DeviceArch_Gen9, { + 0x1900, // SKL + 0x5900, // KBL + 0x3E00, 0x9B00, // CFL + 0x8A00, // ICX + } + }, + { + DeviceArch_XeLP, { + 0xFF20, 0x9A00, // TGL + 0x4900, // DG1 + 0x4C00, // RKL + 0x4600, // ADLS + } + }, +}; + +int DebugLevel = getDebugLevel(); + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// FIXME: we should actually include omp.h instead of declaring +// these ourselves. 
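// A minimal sketch (not part of the patch) of how these declarations are
// consumed: omp_get_thread_limit() and omp_get_max_teams() return INT_MAX
// when the corresponding limits are unset, and RTLOptionTy below normalizes
// that to 0 ("no limit"), roughly as:
//
//   int ThrLimit = omp_get_thread_limit();
//   uint32_t ThreadLimit =
//       (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int>::max)())
//           ? ThrLimit : 0;
//
// On Linux the declarations are weak, presumably so the plugin can still be
// loaded when a particular libomp entry point is unavailable.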
+#if _WIN32 +int __cdecl omp_get_max_teams(void); +int __cdecl omp_get_thread_limit(void); +double __cdecl omp_get_wtime(void); +int __cdecl __kmpc_global_thread_num(void *); +#else // !_WIN32 +int omp_get_max_teams(void) __attribute__((weak)); +int omp_get_thread_limit(void) __attribute__((weak)); +double omp_get_wtime(void) __attribute__((weak)); +int __kmpc_global_thread_num(void *) __attribute__((weak)); +#endif // !_WIN32 + +#ifdef __cplusplus +} +#endif // __cplusplus + +class KernelInfoTy { + uint32_t Version = 0; + uint64_t Attributes1 = 0; + uint64_t WGNum = 0; + uint64_t WINum = 0; + + struct KernelArgInfoTy { + bool IsLiteral = false; + uint32_t Size = 0; + KernelArgInfoTy(bool IsLiteral, uint32_t Size) + : IsLiteral(IsLiteral), Size(Size) {} + }; + std::vector ArgsInfo; + + void checkVersion(uint32_t MinVer) const { + assert(Version >= MinVer && + "API is not supported for this version of KernelInfoTy."); + (void)Version; + } + +public: + KernelInfoTy(uint32_t Version) : Version(Version) {} + void addArgInfo(bool IsLiteral, uint32_t Size) { + checkVersion(1); + ArgsInfo.emplace_back(IsLiteral, Size); + } + size_t getArgsNum() const { + checkVersion(1); + return ArgsInfo.size(); + } + bool isArgLiteral(uint32_t Idx) const { + checkVersion(1); + return ArgsInfo[Idx].IsLiteral; + } + uint32_t getArgSize(uint32_t Idx) const { + checkVersion(1); + return ArgsInfo[Idx].Size; + } + void setAttributes1(uint64_t Val) { + Attributes1 = Val; + } + bool getHasTeamsReduction() const { + return (Attributes1 & 1); + } + void setWGNum(uint64_t Val) { + WGNum = Val; + } + uint64_t getWGNum() const { + return WGNum; + } + void setWINum(uint64_t Val) { + WINum = Val; + } + uint64_t getWINum() const { + return WINum; + } + bool isAtomicFreeReduction() const { + return getWGNum(); + } +}; + +/// Loop descriptor +typedef struct { + int64_t Lb; // The lower bound of the i-th loop + int64_t Ub; // The upper bound of the i-th loop + int64_t Stride; // The stride of the i-th loop +} TgtLoopDescTy; + +typedef struct { + int32_t NumLoops; // Number of loops/dimensions + int32_t DistributeDim; // Dimensions lower than this one + // must end up in one WG + TgtLoopDescTy Levels[3]; // Up to 3 loops +} TgtNDRangeDescTy; + +/// Profile data +struct ProfileDataTy { + struct TimingsTy { + double host = 0.0; + double device = 0.0; + }; + + std::map data; + + std::string alignLeft(size_t Width, std::string Str) { + if (Str.size() < Width) + return Str + std::string(Width - Str.size(), ' '); + return Str; + } + + void printData(int32_t deviceId, int32_t threadId, const char *deviceName, + int64_t resolution) { + std::string profileSep(80, '='); + std::string lineSep(80, '-'); + + fprintf(stderr, "%s\n", profileSep.c_str()); + + fprintf(stderr, "LIBOMPTARGET_PLUGIN_PROFILE(%s) for OMP DEVICE(%" PRId32 + ") %s, Thread %" PRId32 "\n", GETNAME(TARGET_NAME), deviceId, + deviceName, threadId); + + fprintf(stderr, "%s\n", lineSep.c_str()); + + const char *unit = resolution == 1000 ? 
"msec" : "usec"; + + std::string kernelPrefix("Kernel "); + size_t maxKeyLength = kernelPrefix.size() + 3; + for (const auto &d : data) + if (d.first.substr(0, kernelPrefix.size()) != kernelPrefix && + maxKeyLength < d.first.size()) + maxKeyLength = d.first.size(); + + // Print kernel key and name + int kernelId = 0; + for (const auto &d: data) { + if (d.first.substr(0, kernelPrefix.size()) == kernelPrefix) + fprintf(stderr, "-- %s: %s\n", + alignLeft(maxKeyLength, kernelPrefix + + std::to_string(kernelId++)).c_str(), + d.first.substr(kernelPrefix.size()).c_str()); + } + + fprintf(stderr, "%s\n", lineSep.c_str()); + + fprintf(stderr, "-- %s: Host Time (%s) Device Time (%s)\n", + alignLeft(maxKeyLength, "Name").c_str(), unit, unit); + + double hostTotal = 0.0; + double deviceTotal = 0.0; + kernelId = 0; + for (const auto &d : data) { + double hostTime = 1e-9 * d.second.host * resolution; + double deviceTime = 0.0; + std::string key(d.first); + + if (d.first.substr(0, kernelPrefix.size()) == kernelPrefix) { + key = kernelPrefix + std::to_string(kernelId++); + deviceTime = 1e-9 * d.second.device * resolution; + } else if (d.first.substr(0, 8) == "DataRead" || + d.first.substr(0, 9) == "DataWrite") { + deviceTime = 1e-9 * d.second.device * resolution; + } + + fprintf(stderr, "-- %s: %20.3f %20.3f\n", + alignLeft(maxKeyLength, key).c_str(), hostTime, deviceTime); + hostTotal += hostTime; + deviceTotal += deviceTime; + } + fprintf(stderr, "-- %s: %20.3f %20.3f\n", + alignLeft(maxKeyLength, "Total").c_str(), hostTotal, deviceTotal); + fprintf(stderr, "%s\n", profileSep.c_str()); + } + + // for non-event profile + void update( + const char *name, cl_ulong host_elapsed, cl_ulong device_elapsed) { + std::string key(name); + TimingsTy &timings = data[key]; + timings.host += host_elapsed; + timings.device += device_elapsed; + } + + void update( + const char *name, double host_elapsed, double device_elapsed) { + std::string key(name); + TimingsTy &timings = data[key]; + timings.host += host_elapsed; + timings.device += device_elapsed; + } + + // for event profile + void update(const char *name, cl_event event) { + cl_ulong host_begin = 0, host_end = 0; + CALL_CLW_RET_VOID(clGetEventProfilingInfo, event, + CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &host_begin, nullptr); + CALL_CLW_RET_VOID(clGetEventProfilingInfo, event, + CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &host_end, nullptr); + cl_ulong device_begin = 0, device_end = 0; + CALL_CLW_RET_VOID(clGetEventProfilingInfo, event, + CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &device_begin, nullptr); + CALL_CLW_RET_VOID(clGetEventProfilingInfo, event, + CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &device_end, nullptr); + update(name, host_end - host_begin, device_end - device_begin); + } +}; // ProfileDataTy + +// Platform-dependent information -- context and INTEL extension API +struct PlatformInfoTy { + cl_platform_id Platform = nullptr; + cl_context Context = nullptr; + std::vector ExtensionFunctionNames { +#define EXTENSION_FN_NAME(Fn) TO_STRING(Fn), + FOR_EACH_EXTENSION_FN(EXTENSION_FN_NAME) + }; + std::vector ExtensionFunctionPointers; + + PlatformInfoTy() = default; + + PlatformInfoTy(cl_platform_id platform, cl_context context) { + Platform = platform; + Context = context; + ExtensionFunctionPointers.resize(ExtensionFunctionNames.size(), nullptr); + for (int i = 0; i < ExtensionIdLast; i++) { + CALL_CL_RV(ExtensionFunctionPointers[i], + clGetExtensionFunctionAddressForPlatform, platform, + ExtensionFunctionNames[i]); + if 
(ExtensionFunctionPointers[i]) { + DP("Extension %s is found.\n", ExtensionFunctionNames[i]); + } else { + DP("Warning: Extension %s is not found.\n", ExtensionFunctionNames[i]); + } + } + } +}; + +// OpenCL extensions status. +enum ExtensionStatusTy : uint8_t { + // Default value. It is unknown if the extension is supported. + ExtensionStatusUnknown = 0, + + // Extension is disabled (either because it is unsupported or + // due to user environment control). + ExtensionStatusDisabled, + + // Extenstion is enabled. An extension can only be used, + // if it has this status after __tgt_rtl_load_binary. + ExtensionStatusEnabled, +}; + +// A descriptor of OpenCL extensions with their statuses. +struct ExtensionsTy { + ExtensionStatusTy UnifiedSharedMemory = ExtensionStatusUnknown; + ExtensionStatusTy DeviceAttributeQuery = ExtensionStatusUnknown; + ExtensionStatusTy GetDeviceGlobalVariablePointer = ExtensionStatusUnknown; + ExtensionStatusTy SuggestedGroupSize = ExtensionStatusUnknown; + + // Libdevice extensions that may be supported by device runtime. + struct LibdeviceExtDescTy { + const char *Name; + const char *FallbackLibName; + ExtensionStatusTy Status; + }; + + std::vector LibdeviceExtensions = { + { + "cl_intel_devicelib_cassert", + "libomp-fallback-cassert.spv", + ExtensionStatusUnknown + }, + { + "cl_intel_devicelib_math", + "libomp-fallback-cmath.spv", + ExtensionStatusUnknown + }, + { + "cl_intel_devicelib_math_fp64", + "libomp-fallback-cmath-fp64.spv", + ExtensionStatusUnknown + }, + { + "cl_intel_devicelib_complex", + "libomp-fallback-complex.spv", + ExtensionStatusUnknown + }, + { + "cl_intel_devicelib_complex_fp64", + "libomp-fallback-complex-fp64.spv", + ExtensionStatusUnknown + }, + { + "cl_intel_devicelib_cstring", + "libomp-fallback-cstring.spv", + ExtensionStatusUnknown + }, + }; + + // Initialize extensions' statuses for the given device. + int32_t getExtensionsInfoForDevice(int32_t DeviceId); +}; + +/// Data transfer method +enum DataTransferMethodTy { + DATA_TRANSFER_METHOD_INVALID = -1, // Invalid + DATA_TRANSFER_METHOD_CLMEM = 0, // Use Buffer on SVM + DATA_TRANSFER_METHOD_SVMMAP, // Use SVMMap/Unmap + DATA_TRANSFER_METHOD_SVMMEMCPY, // Use SVMMemcpy + DATA_TRANSFER_METHOD_LAST, +}; + +/// OpenCL program that can contain multiple OCL programs +class OpenCLProgramTy { + struct DeviceOffloadEntryTy { + /// Common part with the host offload table. + __tgt_offload_entry Base; + /// Length of the Base.name string in bytes including + /// the null terminator. + size_t NameSize; + }; + + /// Cached device image + __tgt_device_image *Image = nullptr; + + /// Cached OpenCL context + cl_context Context = nullptr; + + /// Cached OpenDL device + cl_device_id Device = nullptr; + + /// Cached OpenMP device ID + int32_t DeviceId = 0; + + /// Program is created from binary? + bool IsBinary = false; + + /// Target table + __tgt_target_table Table; + + /// Target entries + std::vector<__tgt_offload_entry> Entries; + + /// Internal offload entries + std::vector OffloadEntries; + + /// Handle multiple modules within a single target image + std::vector Programs; + + /// Kernels created from the target image + std::vector Kernels; + + /// Kernel info added by compiler + std::unordered_map KernelInfo; + + /// Final OpenCL program + cl_program FinalProgram = nullptr; + + /// Requires program link + bool RequiresProgramLink = false; + + /// Loads the device version of the offload table for device \p DeviceId. + /// The table is expected to have \p NumEntries entries. 
+ /// Returns true, if the load was successful, false - otherwise. + bool loadOffloadTable(size_t NumEntries); + + /// Add a single OpenCL program created from the given SPIR-V image + int32_t addProgramIL(const size_t Size, const unsigned char *Image); + + /// Add a single OpenCL program created from the given native image + int32_t addProgramBIN(const size_t Size, const unsigned char *Image); + + /// Looks up an OpenMP declare target global variable with the given + /// \p Name and \p Size in the device environment for the current device. + /// The lookup is first done via the device offload table. If it fails, + /// then the lookup falls back to non-OpenMP specific lookup on the device. + void *getOffloadVarDeviceAddr(const char *Name, size_t Size); + + /// Read KernelInfo auxiliary information for the specified kernel. + /// The information is stored in \p KernelInfo. + /// The function is called during the binary loading. + bool readKernelInfo(const __tgt_offload_entry &KernelEntry); + +public: + OpenCLProgramTy() = default; + + OpenCLProgramTy(__tgt_device_image *Image_, cl_context Context_, + cl_device_id Device_, int32_t DeviceId_) : + Image(Image_), Context(Context_), Device(Device_), DeviceId(DeviceId_) {} + + ~OpenCLProgramTy(); + + int32_t buildPrograms(std::string &CompilationOptions, + std::string &LinkingOptions); + + int32_t compilePrograms(std::string &CompilationOptions, + std::string &LinkingOptions); + + int32_t linkPrograms(std::string &LinkingOptions); + + /// Looks up an external global variable with the given \p Name + /// in the device environment for device \p DeviceId. + /// \p Size must not be null. If (*SizePtr) is not zero, then + /// the lookup verifies that the found variable's size matches + /// (*SizePtr), otherwise, the found variable's size is returned + /// via \p Size. + void *getVarDeviceAddr(const char *Name, size_t *SizePtr); + + /// Looks up an external global variable with the given \p Name + /// and \p Size in the device environment for device \p DeviceId. + void *getVarDeviceAddr(const char *Name, size_t Size); + + /// Build kernels from all modules. + int32_t buildKernels(); + + /// Return the pointer to the offload table. + __tgt_target_table *getTablePtr() { return &Table; } + + /// Returns the auxiliary kernel information for the specified kernel. + const KernelInfoTy *getKernelInfo(cl_kernel Kernel) const; +}; + +/// RTL flags +struct RTLFlagsTy { + uint64_t CollectDataTransferLatency : 1; + uint64_t EnableProfile : 1; + uint64_t UseInteropQueueInorderAsync : 1; + uint64_t UseInteropQueueInorderSharedSync : 1; + uint64_t UseHostMemForUSM : 1; + uint64_t UseDriverGroupSizes : 1; + uint64_t EnableSimd : 1; + uint64_t UseSVM : 1; + uint64_t UseBuffer : 1; + uint64_t UseSingleContext : 1; + uint64_t UseImageOptions : 1; + uint64_t ShowBuildLog : 1; + uint64_t LinkLibDevice : 1; + // Add new flags here + uint64_t Reserved : 51; + RTLFlagsTy() : + CollectDataTransferLatency(0), + EnableProfile(0), + UseInteropQueueInorderAsync(0), + UseInteropQueueInorderSharedSync(0), + UseHostMemForUSM(0), + UseDriverGroupSizes(0), + EnableSimd(0), + UseSVM(0), + UseBuffer(0), + UseSingleContext(0), + UseImageOptions(1), + ShowBuildLog(0), + LinkLibDevice(0), + Reserved(0) {} +}; + +/// Kernel properties. +struct KernelPropertiesTy { + size_t Width = 0; + size_t SIMDWidth = 0; + size_t MaxThreadGroupSize = 0; + /// Kernel-specific implicit arguments + std::set ImplicitArgs; +}; + +/// Specialization constants used for an OpenCL program compilation. 
+class SpecConstantsTy { + std::vector ConstantIds; + std::vector ConstantValueSizes; + std::vector ConstantValues; + +public: + SpecConstantsTy() = default; + SpecConstantsTy(const SpecConstantsTy &) = delete; + SpecConstantsTy(const SpecConstantsTy &&Other) + : ConstantIds(std::move(Other.ConstantIds)), + ConstantValueSizes(std::move(Other.ConstantValueSizes)), + ConstantValues(std::move(Other.ConstantValues)) {} + + ~SpecConstantsTy() { + for (auto I : ConstantValues) { + const char *ValuePtr = reinterpret_cast(I); + delete[] ValuePtr; + } + } + + template + void addConstant(uint32_t Id, T Val) { + const size_t ValSize = sizeof(Val); + char *ValuePtr = new char[ValSize]; + *reinterpret_cast(ValuePtr) = Val; + + ConstantIds.push_back(Id); + ConstantValueSizes.push_back(ValSize); + ConstantValues.push_back(reinterpret_cast(ValuePtr)); + } + + void setProgramConstants(int32_t DeviceId, cl_program Program) const; +}; + +struct MemAllocInfoTy { + /// Base address allocated from compute runtime + void *Base = nullptr; + /// Allocation size known to users/libomptarget + size_t Size = 0; + /// TARGET_ALLOC kind + int32_t Kind = TARGET_ALLOC_DEFAULT; + /// Allocation from pool? + bool InPool = false; + /// Is implicit argument + bool ImplicitArg = false; + + MemAllocInfoTy() = default; + + MemAllocInfoTy(void *_Base, size_t _Size, int32_t _Kind, bool _InPool, + bool _ImplicitArg) : + Base(_Base), Size(_Size), Kind(_Kind), InPool(_InPool), + ImplicitArg(_ImplicitArg) {} +}; + +/// Allocation information maintained in RTL +class MemAllocInfoMapTy { + /// Map from allocated pointer to allocation information + std::map Map; + /// Map from target alloc kind to number of implicit arguments + std::map NumImplicitArgs; + /// Mutex for guarding the internal data + std::mutex Mtx; + +public: + void add(void *Ptr, void *_Base, size_t _Size, int32_t _Kind, + bool _InPool = false, bool _ImplicitArg = false) { + std::lock_guard Lock(Mtx); + auto Inserted = Map.emplace( + Ptr, MemAllocInfoTy{_Base, _Size, _Kind, _InPool, _ImplicitArg}); + (void)Inserted; + if (_ImplicitArg) + NumImplicitArgs[_Kind]++; + } + + bool remove(void *Ptr, MemAllocInfoTy *Removed = nullptr) { + std::lock_guard Lock(Mtx); + auto AllocInfo = Map.find(Ptr); + if (AllocInfo == Map.end()) + return false; + if (AllocInfo->second.ImplicitArg) + NumImplicitArgs[AllocInfo->second.Kind]--; + if (Removed) + *Removed = AllocInfo->second; + Map.erase(AllocInfo); + return true; + } + + const MemAllocInfoTy *find(void *Ptr) { + std::lock_guard Lock(Mtx); + auto AllocInfo = Map.find(Ptr); + if (AllocInfo == Map.end()) + return nullptr; + else + return &AllocInfo->second; + } + + /// Return allocation information if Ptr belongs to any allocation range. + const MemAllocInfoTy *search(void *Ptr) { + std::lock_guard Lock(Mtx); + if (Map.size() == 0) + return nullptr; + auto I = Map.upper_bound(const_cast(Ptr)); + // Key pointer (I->first) may be greater than Ptr, so both I and --I need to + // be checked. 
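// Worked example with hypothetical addresses: suppose an entry was added as
// add(Ptr=0x1040, Base=0x1000, Size=0x100). For a query pointer 0x1008,
// upper_bound() returns the entry keyed at 0x1040; its derived range
// [Base, Base + Size + (key - Base)) = [0x1000, 0x1140) still covers the
// query, so the entry at I itself must be checked, not only the one at --I.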
+ int J = 0; + do { + std::advance(I, J); + if (I == Map.end()) + continue; + uintptr_t AllocBase = (uintptr_t)I->second.Base; + size_t AllocSize = I->second.Size + (uintptr_t)I->first - AllocBase; + if (AllocBase <= (uintptr_t)Ptr && (uintptr_t)Ptr < AllocBase + AllocSize) + return &I->second; + } while (J-- > -1 && I != Map.begin()); + + return nullptr; + } + + bool contains(const void *Ptr, size_t Size) { + std::lock_guard Lock(Mtx); + if (Map.size() == 0) + return false; + auto I = Map.upper_bound(const_cast(Ptr)); + if (I == Map.begin()) + return false; + --I; + bool Ret = (uintptr_t)I->first <= (uintptr_t)Ptr && + (uintptr_t)Ptr + (uintptr_t)Size <= + (uintptr_t)I->first + (uintptr_t)I->second.Size; + return Ret; + } + + /// Add a list of implicit arguments to the output vector. + void getImplicitArgs( + std::vector &SVMArgs, std::vector &USMArgs) { + std::lock_guard Lock(Mtx); + for (auto &AllocInfo : Map) { + if (AllocInfo.second.ImplicitArg) { + if (AllocInfo.second.Kind == TARGET_ALLOC_SVM) + SVMArgs.push_back(AllocInfo.first); + else + USMArgs.push_back(AllocInfo.first); + } + } + } + + bool hasImplicitUSMArg(int32_t Kind = TARGET_ALLOC_DEFAULT) { + std::lock_guard Lock(Mtx); + if (Kind == TARGET_ALLOC_DEFAULT) { + uint32_t Num = NumImplicitArgs[TARGET_ALLOC_HOST] + + NumImplicitArgs[TARGET_ALLOC_DEVICE] + + NumImplicitArgs[TARGET_ALLOC_SHARED]; + return Num > 0; + } else { + return NumImplicitArgs[Kind] > 0; + } + } +}; + +/// RTL options and flags users can override +struct RTLOptionTy { + /// Binary flags + RTLFlagsTy Flags; + + /// Emulated data transfer latency in microsecond + int32_t DataTransferLatency = 0; + + /// Data transfer method when SVM is used + int32_t DataTransferMethod = DATA_TRANSFER_METHOD_SVMMAP; + + /// Plugin profiling resolution (msec by default) + int64_t ProfileResolution = 1000; + + /// Used device type + cl_device_type DeviceType = CL_DEVICE_TYPE_GPU; + + // OpenCL 2.0 builtins (like atomic_load_explicit and etc.) are used by + // runtime, so we have to explicitly specify the "-cl-std=CL2.0" compilation + // option. With it, the SPIR-V will be converted to LLVM IR with OpenCL 2.0 + // builtins. Otherwise, SPIR-V will be converted to LLVM IR with OpenCL 1.2 + // builtins. + std::string CompilationOptions = "-cl-std=CL2.0 "; + std::string UserCompilationOptions = ""; + std::string UserLinkingOptions = ""; + + /// Limit for the number of WIs in a WG. + uint32_t ThreadLimit = 0; + + /// Limit for the number of WGs. + uint32_t NumTeams = 0; + + // This is a factor applied to the number of WGs computed + // for the execution, based on the HW characteristics. + size_t SubscriptionRate = 1; + + // For kernels that compute cross-WG reductions the number of computed WGs + // is reduced by this factor. + size_t ReductionSubscriptionRate = 1; + bool ReductionSubscriptionRateIsDefault = true; + + /// Loop kernels with known ND-range may be known to have + /// few iterations and they may not exploit the offload device + /// to the fullest extent. + /// Let's assume a device has N total HW threads available, + /// and the kernel requires M hardware threads with LWS set to L. + /// If (M < N * ThinThreadsThreshold), then we will try + /// to iteratively divide L by 2 to increase the number of HW + /// threads used for executing the kernel. Effectively, we will + /// end up with L less than the kernel's SIMD width, so the HW + /// threads will not use all their SIMD lanes. 
This (presumably) should + /// allow more parallelism, because the stalls in the SIMD lanes + /// will be distributed across more HW threads, and the probability + /// of having a stall (or a sequence of stalls) on a critical path + /// in the kernel should decrease. + /// Anyway, this is just a heuristics that seems to work well for some + /// kernels (which poorly expose parallelism in the first place). + double ThinThreadsThreshold = 0.1; + + // Spec constants used for all OpenCL programs. + SpecConstantsTy CommonSpecConstants; + + RTLOptionTy() { + const char *Env; + + // Get global OMP_THREAD_LIMIT for SPMD parallelization. + int ThrLimit = omp_get_thread_limit(); + DP("omp_get_thread_limit() returned %" PRId32 "\n", ThrLimit); + // omp_get_thread_limit() would return INT_MAX by default. + // NOTE: Windows.h defines max() macro, so we have to guard + // the call with parentheses. + ThreadLimit = (ThrLimit > 0 && + ThrLimit != (std::numeric_limits::max)()) ? ThrLimit : 0; + + // Global max number of teams. + int NTeams = omp_get_max_teams(); + DP("omp_get_max_teams() returned %" PRId32 "\n", NTeams); + // omp_get_max_teams() would return INT_MAX by default. + // NOTE: Windows.h defines max() macro, so we have to guard + // the call with parentheses. + NumTeams = (NTeams > 0 && + NTeams != (std::numeric_limits::max)()) ? NTeams : 0; + + // Read LIBOMPTARGET_DATA_TRANSFER_LATENCY (experimental input) + if ((Env = readEnvVar("LIBOMPTARGET_DATA_TRANSFER_LATENCY"))) { + std::string Value(Env); + if (Value.substr(0, 2) == "T,") { + Flags.CollectDataTransferLatency = 1; + int32_t Usec = std::stoi(Value.substr(2).c_str()); + DataTransferLatency = (Usec > 0) ? Usec : 0; + } + } + + // Read LIBOMPTARGET_OPENCL_DATA_TRANSFER_METHOD + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_DATA_TRANSFER_METHOD"))) { + std::string Value(Env); + DataTransferMethod = DATA_TRANSFER_METHOD_INVALID; + if (Value.size() == 1 && std::isdigit(Value.c_str()[0])) { + int Method = std::stoi(Env); + if (Method < DATA_TRANSFER_METHOD_LAST) + DataTransferMethod = Method; + } + if (DataTransferMethod == DATA_TRANSFER_METHOD_INVALID) { + WARNING("Invalid data transfer method (%s) selected" + " -- using default method.\n", Env); + DataTransferMethod = DATA_TRANSFER_METHOD_SVMMAP; + } + } + + // Read LIBOMPTARGET_DEVICETYPE + if ((Env = readEnvVar("LIBOMPTARGET_DEVICETYPE"))) { + std::string Value(Env); + if (Value == "GPU" || Value == "gpu" || Value == "") + DeviceType = CL_DEVICE_TYPE_GPU; + else if (Value == "CPU" || Value == "cpu") + DeviceType = CL_DEVICE_TYPE_CPU; + else + WARNING("Invalid or unsupported LIBOMPTARGET_DEVICETYPE=%s\n", Env); + } + DP("Target device type is set to %s\n", + (DeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU"); + + /// Oversubscription rate for normal kernels + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_SUBSCRIPTION_RATE"))) { + int32_t Value = std::stoi(Env); + // Set some reasonable limits. + if (Value > 0 && Value <= 0xFFFF) + SubscriptionRate = Value; + } + + /// Oversubscription rate for reduction kernels + if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_REDUCTION_SUBSCRIPTION_RATE"))) { + int32_t Value = std::stoi(Env); + // Set some reasonable limits. + // '0' is a special value meaning to use regular default ND-range + // for kernels with reductions. 
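As a side note, the INT_MAX guard used for omp_get_thread_limit() above can be exercised in isolation. A minimal, self-contained sketch of the same check, with illustrative names (compile with -fopenmp):

#include <cstdint>
#include <cstdio>
#include <limits>
#include <omp.h>

int main() {
  // An unset OMP_THREAD_LIMIT is reported as INT_MAX and must be treated as
  // "no limit" (0 in the plugin's convention). The extra parentheses around
  // (std::numeric_limits<int>::max)() keep the call safe where windows.h
  // defines a max() macro.
  int ThrLimit = omp_get_thread_limit();
  uint32_t ThreadLimit =
      (ThrLimit > 0 && ThrLimit != (std::numeric_limits<int>::max)())
          ? static_cast<uint32_t>(ThrLimit)
          : 0;
  std::printf("Effective thread limit: %u\n",
              static_cast<unsigned>(ThreadLimit));
  return 0;
}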
+ if (Value >= 0 && Value <= 0xFFFF) { + ReductionSubscriptionRate = Value; + ReductionSubscriptionRateIsDefault = false; + } + } + + /// Read LIBOMPTARGET_PLUGIN_PROFILE + if ((Env = readEnvVar("LIBOMPTARGET_PLUGIN_PROFILE"))) { + std::istringstream Value(Env); + std::string Token; + while (std::getline(Value, Token, ',')) { + if (Token == "T" || Token == "1") + Flags.EnableProfile = 1; + else if (Token == "unit_usec" || Token == "usec") + ProfileResolution = 1000000; + } + } + + if ((Env = readEnvVar("LIBOMPTARGET_ENABLE_SIMD"))) { + if (parseBool(Env) == 1) + Flags.EnableSimd = 1; + else + WARNING("Invalid or unsupported LIBOMPTARGET_ENABLE_SIMD=%s\n", Env); + } + + // TODO: deprecate this variable since the default behavior is equivalent + // to "inorder_async" and "inorder_shared_sync". + // Read LIBOMPTARGET_OPENCL_INTEROP_QUEUE + // Two independent options can be specified as follows. + // -- inorder_async: use a new in-order queue for asynchronous case + // (default: shared out-of-order queue) + // -- inorder_shared_sync: use the existing shared in-order queue for + // synchronous case (default: new in-order queue). + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_INTEROP_QUEUE", + "LIBOMPTARGET_INTEROP_PIPE"))) { + std::istringstream Value(Env); + std::string Token; + while (std::getline(Value, Token, ',')) { + if (Token == "inorder_async") { + Flags.UseInteropQueueInorderAsync = 1; + DP(" enabled in-order asynchronous separate queue\n"); + } else if (Token == "inorder_shared_sync") { + Flags.UseInteropQueueInorderSharedSync = 1; + DP(" enabled in-order synchronous shared queue\n"); + } + } + } + + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_COMPILATION_OPTIONS"))) { + UserCompilationOptions += Env; + } + + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_LINKING_OPTIONS"))) { + UserLinkingOptions += Env; + } + + // Read LIBOMPTARGET_USM_HOST_MEM + if ((Env = readEnvVar("LIBOMPTARGET_USM_HOST_MEM"))) { + if (parseBool(Env) == 1) + Flags.UseHostMemForUSM = 1; + } + + // Read LIBOMPTARGET_OPENCL_USE_SVM + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_USE_SVM"))) { + int32_t Value = parseBool(Env); + if (Value == 1) + Flags.UseSVM = 1; + else if (Value == 0) + Flags.UseSVM = 0; + } + + // Read LIBOMPTARGET_OPENCL_USE_BUFFER + if ((Env = readEnvVar("LIBOMPTARGET_OPENCL_USE_BUFFER"))) { + if (parseBool(Env) == 1) + Flags.UseBuffer = 1; + } + + // Read LIBOMPTARGET_USE_SINGLE_CONTEXT + if ((Env = readEnvVar("LIBOMPTARGET_USE_SINGLE_CONTEXT"))) { + if (parseBool(Env) == 1) + Flags.UseSingleContext = 1; + } + + if (readEnvVar("INTEL_ENABLE_OFFLOAD_ANNOTATIONS")) { + // To match SYCL RT behavior, we just need to check whether + // INTEL_ENABLE_OFFLOAD_ANNOTATIONS is set. The actual value + // does not matter. 
+ CommonSpecConstants.addConstant(0xFF747469, 1); + } + + if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_USE_IMAGE_OPTIONS"))) { + int32_t Value = parseBool(Env); + if (Value == 1) + Flags.UseImageOptions = 1; + else if (Value == 0) + Flags.UseImageOptions = 0; + } + + if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_SHOW_BUILD_LOG"))) { + int32_t Value = parseBool(Env); + if (Value == 1) + Flags.ShowBuildLog = 1; + else if (Value == 0) + Flags.ShowBuildLog = 0; + } + + // LIBOMPTARGET_ONEAPI_LINK_LIBDEVICE + if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_LINK_LIBDEVICE"))) { + int32_t Value = parseBool(Env); + if (Value == 1) + Flags.LinkLibDevice = 1; + else if (Value == 0) + Flags.LinkLibDevice = 0; + } + + if ((Env = readEnvVar("LIBOMPTARGET_ONEAPI_THIN_THREADS_THRESHOLD"))) { + char *StrEnd; + double Value = std::strtod(Env, &StrEnd); + if (errno == 0 && StrEnd != Env && + Value >= 0.0 && Value <= 1.0) { + ThinThreadsThreshold = Value; + } else { + if (errno != 0) + DP("Error parsing value of " + "LIBOMPTARGET_ONEAPI_THIN_THREADS_THRESHOLD: %s\n", + strerror(errno)); + DP("Value of LIBOMPTARGET_ONEAPI_THIN_THREADS_THRESHOLD must " + "be a non-negative floating-point number not greater than 1.0.\n"); + DP("Using default value: %f\n", ThinThreadsThreshold); + } + } + } + + /// Read environment variable value with optional deprecated name + const char *readEnvVar(const char *Name, const char *OldName = nullptr) { + if (!Name) + return nullptr; + const char *Value = std::getenv(Name); + if (Value || !OldName) { + if (Value) + DP("ENV: %s=%s\n", Name, Value); + return Value; + } + Value = std::getenv(OldName); + if (Value) { + DP("ENV: %s=%s\n", OldName, Value); + WARNING("%s is being deprecated. Use %s instead.\n", OldName, Name); + } + return Value; + } + + /// Parse boolean value + /// Return 1 for: TRUE, T, 1, ON, YES, ENABLED (case insensitive) + /// Return 0 for: FALSE, F, 0, OFF, NO, DISABLED (case insensitive) + /// Return -1 for failed match + /// NOTE: we can later simplify the document to just TRUE or FALSE like what + /// OpenMP host runtime does. + int32_t parseBool(const char *Value) { + std::string Str(Value); + std::transform(Str.begin(), Str.end(), Str.begin(), + [](unsigned char C) {return std::tolower(C);}); + if (Str == "true" || Str == "t" || Str == "1" || Str == "on" || + Str == "yes" || Str == "enabled") + return 1; + if (Str == "false" || Str == "f" || Str == "0" || Str == "off" || + Str == "no" || Str == "disabled") + return 0; + return -1; + } +}; // RTLOptionTy + +/// Device property +struct DevicePropertiesTy { + cl_uint DeviceId = 0; + cl_uint NumSlices = 0; + cl_uint NumSubslicesPerSlice = 0; + cl_uint NumEUsPerSubslice = 0; + cl_uint NumThreadsPerEU = 0; + cl_uint NumHWThreads = 0; + + int32_t getDeviceProperties(cl_device_id ID); +}; + +/// Class containing all the device information. +class RTLDeviceInfoTy { +public: + /// Number of OpenMP devices + cl_uint NumDevices = 0; + + /// List of OpenCLProgramTy objects + std::vector> Programs; + + /// Contains context and extension API + std::map PlatformInfos; + + /// Platform that each device belongs to + std::vector Platforms; + + /// Contexts used by each device + std::vector Contexts; + + /// OpenCL device + std::vector Devices; + + // Internal device type ID + std::vector DeviceArchs; + + /// Device properties + std::vector maxExecutionUnits; + std::vector maxWorkGroupSize; + std::vector MaxMemAllocSize; + std::vector DeviceProperties; + + /// A vector of descriptors of OpenCL extensions for each device. 
+ std::vector Extensions; + + /// Default command queues for each devices + std::vector Queues; + + /// Inorder command queues for each devices + std::vector QueuesInOrder; + + /// Kernel properties for each devices + std::vector> KernelProperties; + + /// Kernel-specific implicit arguments + std::vector>> ImplicitArgs; + + /// Thread-private profile information for each devices + std::vector> Profiles; + + std::vector> Names; + + /// Whether each devices are initialized + std::vector Initialized; + + std::vector SLMSize; + + std::mutex *Mutexes; + + std::mutex *ProfileLocks; + + std::vector> ClMemBuffers; + + /// Memory owned by the plugin + std::vector> OwnedMemory; + + /// Internal allocation information + std::vector> MemAllocInfo; + + /// Requires flags + int64_t RequiresFlags = OMP_REQ_UNDEFINED; + + /// Number of active kernel launches for each device + std::vector NumActiveKernels; + + /// RTL option + RTLOptionTy Option; + + RTLDeviceInfoTy() = default; + + /// Return per-thread profile data + ProfileDataTy &getProfiles(int32_t DeviceId) { + int32_t gtid = __kmpc_global_thread_num(nullptr); + ProfileLocks[DeviceId].lock(); + auto &profiles = Profiles[DeviceId]; + if (profiles.count(gtid) == 0) + profiles.emplace(gtid, ProfileDataTy()); + auto &profileData = profiles[gtid]; + ProfileLocks[DeviceId].unlock(); + return profileData; + } + + /// Return context for the given device ID + cl_context getContext(int32_t DeviceId) { + if (Option.Flags.UseSingleContext) + return PlatformInfos[Platforms[DeviceId]].Context; + else + return Contexts[DeviceId]; + } + + /// Return the extension function pointer for the given ID + void *getExtensionFunctionPtr(int32_t DeviceId, int32_t ExtensionId) { + auto platformId = Platforms[DeviceId]; + return PlatformInfos[platformId].ExtensionFunctionPointers[ExtensionId]; + } + + /// Return the extension function name for the given ID + const char *getExtensionFunctionName(int32_t DeviceId, int32_t ExtensionId) { + auto platformId = Platforms[DeviceId]; + return PlatformInfos[platformId].ExtensionFunctionNames[ExtensionId]; + } + + /// Check if extension function is available and enabled. + bool isExtensionFunctionEnabled(int32_t DeviceId, int32_t ExtensionId) { + if (!getExtensionFunctionPtr(DeviceId, ExtensionId)) + return false; + + switch (ExtensionId) { + case clGetMemAllocInfoINTELId: + case clHostMemAllocINTELId: + case clDeviceMemAllocINTELId: + case clSharedMemAllocINTELId: + case clMemFreeINTELId: + case clSetKernelArgMemPointerINTELId: + case clEnqueueMemcpyINTELId: + return Extensions[DeviceId].UnifiedSharedMemory == ExtensionStatusEnabled; + case clGetDeviceGlobalVariablePointerINTELId: + return Extensions[DeviceId].GetDeviceGlobalVariablePointer == + ExtensionStatusEnabled; + case clGetKernelSuggestedLocalWorkSizeINTELId: + return Extensions[DeviceId].SuggestedGroupSize == ExtensionStatusEnabled; + default: + return true; + } + } + + /// Reset program data + int32_t resetProgramData(int32_t DeviceId); + + /// Allocate cl_mem data + void *allocDataClMem(int32_t DeviceId, size_t Size); + + /// Get PCI device ID + uint32_t getPCIDeviceId(int32_t DeviceId); + + /// Get device arch + uint64_t getDeviceArch(int32_t DeviceId); + + /// Get allocated memory type + cl_unified_shared_memory_type_intel getMemAllocType( + int32_t DeviceId, const void *Ptr); + + /// For the given kernel return its KernelInfo auxiliary information + /// that was previously read by readKernelInfo(). 
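getProfiles() above guards a lazily populated per-thread map with an explicit lock()/unlock() pair. A minimal standalone sketch of the same lookup written with a scoped lock, using illustrative types in place of the plugin's profile data:

#include <map>
#include <mutex>

struct ProfileDataSketchTy {
  double HostElapsed = 0.0;
  double DeviceElapsed = 0.0;
};

class PerThreadProfilesTy {
  std::map<int, ProfileDataSketchTy> Profiles;
  std::mutex Mtx;

public:
  // Lazily creates the per-thread record; std::lock_guard releases the mutex
  // on every return path, so no explicit unlock() is needed. References into
  // a std::map stay valid after the lock is dropped.
  ProfileDataSketchTy &get(int ThreadId) {
    std::lock_guard<std::mutex> Lock(Mtx);
    return Profiles[ThreadId]; // operator[] default-constructs on first use
  }
};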
+ const KernelInfoTy * + getKernelInfo(int32_t DeviceId, const cl_kernel &Kernel) const; + + /// Get memory allocation properties to be used in memory allocation + std::unique_ptr> + getAllocMemProperties(int32_t DeviceId, size_t Size); +}; + +#ifdef _WIN32 +#define __ATTRIBUTE__(X) +#else +#define __ATTRIBUTE__(X) __attribute__((X)) +#endif // _WIN32 + +static RTLDeviceInfoTy *DeviceInfo = nullptr; + +__ATTRIBUTE__(constructor(101)) void init() { + DP("Init OpenCL plugin!\n"); + DeviceInfo = new RTLDeviceInfoTy(); +} + +__ATTRIBUTE__(destructor(101)) void deinit() { + DP("Deinit OpenCL plugin!\n"); + delete DeviceInfo; + DeviceInfo = nullptr; +} + +#if _WIN32 +extern "C" BOOL WINAPI +DllMain(HINSTANCE const instance, // handle to DLL module + DWORD const reason, // reason for calling function + LPVOID const reserved) // reserved +{ + // Perform actions based on the reason for calling. + switch (reason) { + case DLL_PROCESS_ATTACH: + // Initialize once for each new process. + // Return FALSE to fail DLL load. + init(); + break; + + case DLL_THREAD_ATTACH: + // Do thread-specific initialization. + break; + + case DLL_THREAD_DETACH: + // Do thread-specific cleanup. + break; + + case DLL_PROCESS_DETACH: + break; + } + return TRUE; // Successful DLL_PROCESS_ATTACH. +} +#endif // _WIN32 + +// Helper class to collect time intervals for host and device. +// The interval is managed by start()/stop() methods. +// The automatic flush of the time interval happens at the object's +// destruction. +class ProfileIntervalTy { + // A timer interval may be either disabled, paused or running. + // Interval may switch from paused to running and from running + // to paused. Interval may switch to disabled start from any state, + // and it cannot switch to disabled to anything else. + enum TimerStatusTy { + Disabled, + Paused, + Running + }; + + // Cumulative times collected by this interval so far. + double DeviceElapsed = 0.0; + double HostElapsed = 0.0; + + // Temporary timer values initialized at each interval + // (re)start. + cl_ulong DeviceTimeTemp = 0; + cl_ulong HostTimeTemp = 0; + + // The interval name as seen in the profile data output. + std::string Name; + + // OpenMP device id. + int32_t DeviceId; + + // OpenCL device id. + cl_device_id ClDeviceId; + + // Current status of the interval. + TimerStatusTy Status; + +public: + // Create new timer interval for the given OpenMP device + // and with the given name (which will be used for the profile + // data output). + ProfileIntervalTy(const char *Name, int32_t DeviceId) + : Name(Name), DeviceId(DeviceId), + ClDeviceId(DeviceInfo->Devices[DeviceId]) { + if (DeviceInfo->Option.Flags.EnableProfile) + // Start the interval paused. + Status = TimerStatusTy::Paused; + else + // Disable the interval for good. + Status = TimerStatusTy::Disabled; + } + + // The destructor automatically updates the profile data. + ~ProfileIntervalTy() { + if (Status == TimerStatusTy::Disabled) + return; + if (Status == TimerStatusTy::Running) { + Status = TimerStatusTy::Disabled; + WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s " + "is disabled due to start/stop mismatch.\n", + Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data()); + return; + } + + DeviceInfo->getProfiles(DeviceId).update( + Name.c_str(), HostElapsed, DeviceElapsed); + } + + // Trigger interval start. 
+ void start() { + if (Status == TimerStatusTy::Disabled) + return; + if (Status == TimerStatusTy::Running) { + Status = TimerStatusTy::Disabled; + WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s " + "is disabled due to start/stop mismatch.\n", + Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data()); + return; + } + cl_int rc; + CALL_CL(rc, clGetDeviceAndHostTimer, ClDeviceId, &DeviceTimeTemp, + &HostTimeTemp); + if (rc != CL_SUCCESS) { + Status = TimerStatusTy::Disabled; + WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s " + "is disabled due to invalid OpenCL timer.\n", + Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data()); + return; + } + Status = TimerStatusTy::Running; + } + + // Trigger interval stop (actually, a pause). + void stop() { + if (Status == TimerStatusTy::Disabled) + return; + if (Status == TimerStatusTy::Paused) { + Status = TimerStatusTy::Disabled; + WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s " + "is disabled due to start/stop mismatch.\n", + Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data()); + return; + } + + cl_ulong DeviceTime, HostTime; + cl_int rc; + CALL_CL(rc, clGetDeviceAndHostTimer, ClDeviceId, &DeviceTime, &HostTime); + if (rc != CL_SUCCESS) { + Status = TimerStatusTy::Disabled; + WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s " + "is disabled due to invalid OpenCL timer.\n", + Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data()); + return; + } + + if (DeviceTime < DeviceTimeTemp || HostTime < HostTimeTemp) { + Status = TimerStatusTy::Disabled; + WARNING("profiling timer '%s' for OpenMP device (%" PRId32 ") %s " + "is disabled due to timer overflow.\n", + Name.c_str(), DeviceId, DeviceInfo->Names[DeviceId].data()); + return; + } + + DeviceElapsed += + static_cast(DeviceTime) - static_cast(DeviceTimeTemp); + HostElapsed += + static_cast(HostTime) - static_cast(HostTimeTemp); + Status = TimerStatusTy::Paused; + } +}; // ProfileIntervalTy + +/// Clean-up routine to be registered by std::atexit(). 
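ProfileIntervalTy pairs start()/stop() calls and flushes its accumulated totals from the destructor. A self-contained, host-only analogue of that protocol, using std::chrono in place of clGetDeviceAndHostTimer (all names and numbers are illustrative):

#include <chrono>
#include <cstdio>

class IntervalSketchTy {
  enum class State { Paused, Running };
  State St = State::Paused;
  std::chrono::steady_clock::time_point Begin;
  double ElapsedSec = 0.0;
  const char *Name;

public:
  explicit IntervalSketchTy(const char *Name) : Name(Name) {}
  // The destructor reports the total, mirroring the flush into the profile
  // data done by the class above.
  ~IntervalSketchTy() { std::printf("%s: %f s\n", Name, ElapsedSec); }

  void start() {
    if (St != State::Paused)
      return; // mismatched start/stop; the plugin disables the timer here
    Begin = std::chrono::steady_clock::now();
    St = State::Running;
  }

  void stop() {
    if (St != State::Running)
      return;
    ElapsedSec += std::chrono::duration<double>(
                      std::chrono::steady_clock::now() - Begin)
                      .count();
    St = State::Paused;
  }
};

// Usage mirroring the plugin: construct, bracket the measured work with
// start()/stop(), and let the destructor report the total, e.g.
//   IntervalSketchTy T("DataAlloc"); T.start(); /* work */ T.stop();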
+static void closeRTL() { + for (uint32_t i = 0; i < DeviceInfo->NumDevices; i++) { + if (!DeviceInfo->Initialized[i]) + continue; + if (DeviceInfo->Option.Flags.EnableProfile) { + for (auto &profile : DeviceInfo->Profiles[i]) + profile.second.printData(i, profile.first, DeviceInfo->Names[i].data(), + DeviceInfo->Option.ProfileResolution); + } + + CALL_CL_EXIT_FAIL(clReleaseCommandQueue, DeviceInfo->Queues[i]); + + if (DeviceInfo->QueuesInOrder[i]) + CALL_CL_EXIT_FAIL(clReleaseCommandQueue, DeviceInfo->QueuesInOrder[i]); + + for (auto mem : DeviceInfo->OwnedMemory[i]) + CALL_CL_EXT_VOID(i, clMemFreeINTEL, DeviceInfo->getContext(i), mem); + + if (!DeviceInfo->Option.Flags.UseSingleContext) + CALL_CL_EXIT_FAIL(clReleaseContext, DeviceInfo->Contexts[i]); + + DeviceInfo->Programs[i].clear(); + } + + if (DeviceInfo->Option.Flags.UseSingleContext) + for (auto platformInfo : DeviceInfo->PlatformInfos) + CALL_CL_EXIT_FAIL(clReleaseContext, platformInfo.second.Context); + + delete[] DeviceInfo->Mutexes; + delete[] DeviceInfo->ProfileLocks; + DP("Closed RTL successfully\n"); +} + +static std::string getDeviceRTLPath(const char *BaseName) { + std::string RTLPath; +#ifdef _WIN32 + char Path[_MAX_PATH]; + HMODULE Module = nullptr; + if (!GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | + GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + (LPCSTR) &DeviceInfo, &Module)) + return RTLPath; + if (!GetModuleFileNameA(Module, Path, sizeof(Path))) + return RTLPath; + RTLPath = Path; +#else + Dl_info RTLInfo; + if (!dladdr(&DeviceInfo, &RTLInfo)) + return RTLPath; + RTLPath = RTLInfo.dli_fname; +#endif + size_t Split = RTLPath.find_last_of("/\\"); + RTLPath.replace(Split + 1, std::string::npos, BaseName); + return RTLPath; +} + +static inline void addDataTransferLatency() { + if (!DeviceInfo->Option.Flags.CollectDataTransferLatency) + return; + double goal = omp_get_wtime() + 1e-6 * DeviceInfo->Option.DataTransferLatency; + // Naive spinning should be enough + while (omp_get_wtime() < goal) + ; +} + +// FIXME: move this to llvm/BinaryFormat/ELF.h and elf.h: +#define NT_INTEL_ONEOMP_OFFLOAD_VERSION 1 +#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT 2 +#define NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX 3 + +static bool isValidOneOmpImage(__tgt_device_image *Image, + uint64_t &MajorVer, + uint64_t &MinorVer) { + char *ImgBegin = reinterpret_cast(Image->ImageStart); + char *ImgEnd = reinterpret_cast(Image->ImageEnd); + size_t ImgSize = ImgEnd - ImgBegin; + ElfL E(ImgBegin, ImgSize); + if (!E.isValidElf()) { + DP("Warning: unable to get ELF handle: %s!\n", E.getErrmsg(-1)); + return false; + } + + for (auto I = E.section_notes_begin(), IE = E.section_notes_end(); I != IE; + ++I) { + ElfLNote Note = *I; + if (Note.getNameSize() == 0) + continue; + std::string NameStr(Note.getName(), Note.getNameSize()); + if (NameStr != "INTELONEOMPOFFLOAD") + continue; + uint64_t Type = Note.getType(); + if (Type != NT_INTEL_ONEOMP_OFFLOAD_VERSION) + continue; + std::string DescStr(reinterpret_cast(Note.getDesc()), + Note.getDescSize()); + auto DelimPos = DescStr.find('.'); + if (DelimPos == std::string::npos) { + // The version has to look like "Major#.Minor#". 
+ DP("Invalid NT_INTEL_ONEOMP_OFFLOAD_VERSION: '%s'\n", DescStr.c_str()); + return false; + } + std::string MajorVerStr = DescStr.substr(0, DelimPos); + DescStr.erase(0, DelimPos + 1); + MajorVer = std::stoull(MajorVerStr); + MinorVer = std::stoull(DescStr); + bool isSupported = (MajorVer == 1 && MinorVer == 0); + return isSupported; + } + + return false; +} + +static void dumpImageToFile( + const void *Image, size_t ImageSize, const char *Type) { +} + +static void debugPrintBuildLog(cl_program program, cl_device_id did) { + if (DebugLevel <= 0 && !DeviceInfo->Option.Flags.ShowBuildLog) + return; + + size_t len = 0; + CALL_CL_RET_VOID(clGetProgramBuildInfo, program, did, CL_PROGRAM_BUILD_LOG, 0, + nullptr, &len); + // The len must actually be bigger than 0 always, because the log string + // is null-terminated. + if (len == 0) + return; + + std::vector buffer(len); + CALL_CL_RET_VOID(clGetProgramBuildInfo, program, did, CL_PROGRAM_BUILD_LOG, + len, buffer.data(), nullptr); + const char *buildLog = (len > 1) ? buffer.data() : ""; + MESSAGE0("Target build log:"); + std::stringstream Str(buildLog); + std::string Line; + while(std::getline(Str, Line, '\n')) + MESSAGE(" %s", Line.c_str()); +} + +static cl_program createProgramFromFile(const char *BaseName, + int32_t DeviceId) { + std::string RTLPath = getDeviceRTLPath(BaseName); + std::ifstream RTLFile(RTLPath, std::ios::binary); + + if (RTLFile.is_open()) { + DP("Found device RTL: %s\n", RTLPath.c_str()); + RTLFile.seekg(0, RTLFile.end); + int RTLSize = RTLFile.tellg(); + std::string RTL(RTLSize, '\0'); + RTLFile.seekg(0); + if (!RTLFile.read(&RTL[0], RTLSize)) { + DP("I/O Error: Failed to read device RTL.\n"); + return nullptr; + } + + dumpImageToFile(RTL.c_str(), RTLSize, BaseName); + + cl_int RC; + cl_program PGM; + CALL_CL_RVRC(PGM, clCreateProgramWithIL, RC, + DeviceInfo->getContext(DeviceId), RTL.c_str(), RTLSize); + if (RC != CL_SUCCESS) { + DP("Error: Failed to create device RTL from IL: %d\n", RC); + return nullptr; + } + + DeviceInfo->Option.CommonSpecConstants.setProgramConstants(DeviceId, PGM); + + return PGM; + } + + DP("Cannot find device RTL: %s\n", RTLPath.c_str()); + return nullptr; +} + +static inline void *dataAlloc(int32_t DeviceId, int64_t Size, void *HstPtr, + void *HstBase, bool ImplicitArg, cl_uint Align = 0) { + intptr_t Offset = (intptr_t)HstPtr - (intptr_t)HstBase; + // If the offset is negative, then for our practical purposes it can be + // considered 0 because the base address of an array will be contained + // within or after the allocated memory. + intptr_t MeaningfulOffset = Offset >= 0 ? Offset : 0; + // If the offset is negative and the size we map is not large enough to reach + // the base, then we must allocate extra memory up to the base (+1 to include + // at least the first byte the base is pointing to). + int64_t MeaningfulSize = + Offset < 0 && abs(Offset) >= Size ? 
abs(Offset) + 1 : Size; + + void *Base = nullptr; + auto Context = DeviceInfo->getContext(DeviceId); + size_t AllocSize = MeaningfulSize + MeaningfulOffset; + int32_t AllocKind = TARGET_ALLOC_DEVICE; + + ProfileIntervalTy DataAllocTimer("DataAlloc", DeviceId); + DataAllocTimer.start(); + + if (DeviceInfo->Option.Flags.UseSVM) { + AllocKind = TARGET_ALLOC_SVM; + CALL_CL_RV(Base, clSVMAlloc, Context, CL_MEM_READ_WRITE, AllocSize, Align); + } else { + if (!DeviceInfo->isExtensionFunctionEnabled(DeviceId, + clDeviceMemAllocINTELId)) { + DP("Error: Extension %s is not supported\n", + DeviceInfo->getExtensionFunctionName(DeviceId, + clDeviceMemAllocINTELId)); + return nullptr; + } + cl_int RC; + auto AllocProp = DeviceInfo->getAllocMemProperties(DeviceId, AllocSize); + CALL_CL_EXT_RVRC(DeviceId, Base, clDeviceMemAllocINTEL, RC, Context, + DeviceInfo->Devices[DeviceId], AllocProp->data(), + AllocSize, Align); + if (RC != CL_SUCCESS) + return nullptr; + } + if (!Base) { + DP("Error: Failed to allocate base buffer\n"); + return nullptr; + } + DP("Created base buffer " DPxMOD " during data alloc\n", DPxPTR(Base)); + + void *Ret = (void *)((intptr_t)Base + MeaningfulOffset); + + if (ImplicitArg) { + DP("Stashing an implicit argument " DPxMOD " for next kernel\n", + DPxPTR(Ret)); + } + DeviceInfo->MemAllocInfo[DeviceId]->add( + Ret, Base, Size, AllocKind, false, ImplicitArg); + + DataAllocTimer.stop(); + + return Ret; +} + +static void *dataAllocExplicit( + int32_t DeviceId, int64_t Size, int32_t Kind, cl_uint Align = 0) { + auto Device = DeviceInfo->Devices[DeviceId]; + auto Context = DeviceInfo->getContext(DeviceId); + cl_int RC; + void *Mem = nullptr; + ProfileIntervalTy DataAllocTimer("DataAlloc", DeviceId); + DataAllocTimer.start(); + auto ID = DeviceId; + auto AllocProp = DeviceInfo->getAllocMemProperties(DeviceId, Size); + + switch (Kind) { + case TARGET_ALLOC_DEVICE: + Mem = dataAlloc(DeviceId, Size, nullptr, nullptr, true /* ImplicitArg */, + Align); + break; + case TARGET_ALLOC_HOST: + if (DeviceInfo->Option.Flags.UseSingleContext) + ID = DeviceInfo->NumDevices; + if (!DeviceInfo->isExtensionFunctionEnabled(DeviceId, + clHostMemAllocINTELId)) { + DP("Host memory allocator is not available\n"); + return nullptr; + } + CALL_CL_EXT_RVRC(DeviceId, Mem, clHostMemAllocINTEL, RC, Context, + AllocProp->data(), Size, Align); + if (Mem) { + DeviceInfo->MemAllocInfo[ID]->add( + Mem, Mem, Size, Kind, false /* InPool */, true /* IsImplicitArg */); + DP("Allocated a host memory object " DPxMOD "\n", DPxPTR(Mem)); + } + break; + case TARGET_ALLOC_SHARED: + if (!DeviceInfo->isExtensionFunctionEnabled( + DeviceId, clSharedMemAllocINTELId)) { + DP("Shared memory allocator is not available\n"); + return nullptr; + } + CALL_CL_EXT_RVRC(DeviceId, Mem, clSharedMemAllocINTEL, RC, Context, Device, + AllocProp->data(), Size, Align); + if (Mem) { + DeviceInfo->MemAllocInfo[ID]->add( + Mem, Mem, Size, Kind, false /* InPool */, true /* IsImplicitArg */); + DP("Allocated a shared memory object " DPxMOD "\n", DPxPTR(Mem)); + } + break; + default: + FATAL_ERROR("Invalid target data allocation kind"); + } + + DataAllocTimer.stop(); + + return Mem; +} + +static int32_t submitData(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + if (size == 0) + // All other plugins seem to be handling 0 size gracefully, + // so we should do as well. 
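The base/offset sizing in dataAlloc() above is easy to misread when the mapped pointer precedes its host base. A small self-contained rework of just that arithmetic with a few worked cases (the helper name is made up):

#include <cassert>
#include <cstdint>

// How much device memory to reserve so that both the mapped pointer and the
// host base address fall inside the allocation, following the expressions in
// dataAlloc() above.
static int64_t allocSizeFor(intptr_t Offset /* HstPtr - HstBase */,
                            int64_t Size) {
  intptr_t MeaningfulOffset = Offset >= 0 ? Offset : 0;
  int64_t MeaningfulSize =
      (Offset < 0 && -Offset >= Size) ? -Offset + 1 : Size;
  return MeaningfulSize + MeaningfulOffset;
}

int main() {
  assert(allocSizeFor(16, 64) == 80);    // base precedes the mapped pointer
  assert(allocSizeFor(-100, 40) == 101); // mapped range ends before the base:
                                         // grow it so the base byte is covered
  assert(allocSizeFor(-8, 40) == 40);    // base lies inside the mapped range
  return 0;
}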
+ return OFFLOAD_SUCCESS; + + cl_command_queue queue = DeviceInfo->Queues[device_id]; + + // Add synthetic delay for experiments + addDataTransferLatency(); + + const char *ProfileKey = "DataWrite (Host to Device)"; + + if (DeviceInfo->Option.Flags.UseBuffer) { + std::unique_lock lock(DeviceInfo->Mutexes[device_id]); + if (DeviceInfo->ClMemBuffers[device_id].count(tgt_ptr) > 0) { + cl_event event; + CALL_CL_RET_FAIL(clEnqueueWriteBuffer, queue, (cl_mem)tgt_ptr, CL_FALSE, + 0, size, hst_ptr, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + + return OFFLOAD_SUCCESS; + } + } + + if (!DeviceInfo->Option.Flags.UseSVM) { + if (!DeviceInfo->isExtensionFunctionEnabled(device_id, + clEnqueueMemcpyINTELId)) { + DP("Error: Extension %s is not supported\n", + DeviceInfo->getExtensionFunctionName(device_id, + clEnqueueMemcpyINTELId)); + return OFFLOAD_FAIL; + } + cl_event event; + CALL_CL_EXT_RET_FAIL(device_id, clEnqueueMemcpyINTEL, queue, CL_FALSE, + tgt_ptr, hst_ptr, size, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + + return OFFLOAD_SUCCESS; + } + + switch (DeviceInfo->Option.DataTransferMethod) { + case DATA_TRANSFER_METHOD_SVMMAP: { + cl_event event; + ProfileIntervalTy SubmitTime(ProfileKey, device_id); + SubmitTime.start(); + + CALL_CL_RET_FAIL(clEnqueueSVMMap, queue, CL_TRUE, CL_MAP_WRITE, tgt_ptr, + size, 0, nullptr, nullptr); + memcpy(tgt_ptr, hst_ptr, size); + CALL_CL_RET_FAIL(clEnqueueSVMUnmap, queue, tgt_ptr, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + + SubmitTime.stop(); + } break; + case DATA_TRANSFER_METHOD_SVMMEMCPY: { + cl_event event; + CALL_CL_RET_FAIL(clEnqueueSVMMemcpy, queue, CL_TRUE, tgt_ptr, hst_ptr, + size, 0, nullptr, &event); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + } break; + case DATA_TRANSFER_METHOD_CLMEM: + default: { + cl_event event; + cl_int rc; + cl_mem mem = nullptr; + CALL_CL_RVRC(mem, clCreateBuffer, rc, DeviceInfo->getContext(device_id), + CL_MEM_USE_HOST_PTR, size, tgt_ptr); + if (rc != CL_SUCCESS) { + DP("Error: Failed to create a buffer from a SVM pointer " DPxMOD "\n", + DPxPTR(tgt_ptr)); + return OFFLOAD_FAIL; + } + CALL_CL_RET_FAIL(clEnqueueWriteBuffer, queue, mem, CL_FALSE, 0, size, + hst_ptr, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + CALL_CL_RET_FAIL(clReleaseMemObject, mem); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + } + } + return OFFLOAD_SUCCESS; +} + +static int32_t retrieveData(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + if (size == 0) + // All other plugins seem to be handling 0 size gracefully, + // so we should do as well. 
+ return OFFLOAD_SUCCESS; + + cl_command_queue queue = DeviceInfo->Queues[device_id]; + + // Add synthetic delay for experiments + addDataTransferLatency(); + + const char *ProfileKey = "DataRead (Device to Host)"; + + if (DeviceInfo->Option.Flags.UseBuffer) { + std::unique_lock lock(DeviceInfo->Mutexes[device_id]); + if (DeviceInfo->ClMemBuffers[device_id].count(tgt_ptr) > 0) { + cl_event event; + CALL_CL_RET_FAIL(clEnqueueReadBuffer, queue, (cl_mem)tgt_ptr, CL_FALSE, + 0, size, hst_ptr, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + + return OFFLOAD_SUCCESS; + } + } + + if (!DeviceInfo->Option.Flags.UseSVM) { + if (!DeviceInfo->isExtensionFunctionEnabled(device_id, + clEnqueueMemcpyINTELId)) { + DP("Error: Extension %s is not supported\n", + DeviceInfo->getExtensionFunctionName(device_id, + clEnqueueMemcpyINTELId)); + return OFFLOAD_FAIL; + } + cl_event event; + CALL_CL_EXT_RET_FAIL(device_id, clEnqueueMemcpyINTEL, queue, CL_FALSE, + hst_ptr, tgt_ptr, size, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + + return OFFLOAD_SUCCESS; + } + + switch (DeviceInfo->Option.DataTransferMethod) { + case DATA_TRANSFER_METHOD_SVMMAP: { + cl_event event; + ProfileIntervalTy RetrieveTime(ProfileKey, device_id); + RetrieveTime.start(); + + CALL_CL_RET_FAIL(clEnqueueSVMMap, queue, CL_TRUE, CL_MAP_READ, tgt_ptr, + size, 0, nullptr, nullptr); + memcpy(hst_ptr, tgt_ptr, size); + CALL_CL_RET_FAIL(clEnqueueSVMUnmap, queue, tgt_ptr, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + + RetrieveTime.stop(); + } break; + case DATA_TRANSFER_METHOD_SVMMEMCPY: { + cl_event event; + CALL_CL_RET_FAIL(clEnqueueSVMMemcpy, queue, CL_TRUE, hst_ptr, tgt_ptr, + size, 0, nullptr, &event); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + } break; + case DATA_TRANSFER_METHOD_CLMEM: + default: { + cl_int rc; + cl_event event; + cl_mem mem = nullptr; + CALL_CL_RVRC(mem, clCreateBuffer, rc, DeviceInfo->getContext(device_id), + CL_MEM_USE_HOST_PTR, size, tgt_ptr); + if (rc != CL_SUCCESS) { + DP("Error: Failed to create a buffer from a SVM pointer " DPxMOD "\n", + DPxPTR(tgt_ptr)); + return OFFLOAD_FAIL; + } + CALL_CL_RET_FAIL(clEnqueueReadBuffer, queue, mem, CL_FALSE, 0, size, + hst_ptr, 0, nullptr, &event); + CALL_CL_RET_FAIL(clWaitForEvents, 1, &event); + CALL_CL_RET_FAIL(clReleaseMemObject, mem); + if (DeviceInfo->Option.Flags.EnableProfile) + DeviceInfo->getProfiles(device_id).update(ProfileKey, event); + } + } + return OFFLOAD_SUCCESS; +} + +// Return the number of total HW threads required to execute +// a loop kernel compiled with the given simdWidth, and the given +// loop(s) trip counts and group sizes. +// Returns UINT64_MAX, if computations overflow. 
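computeThreadsNeeded() below guards each multiplication by checking the quotient against the maximum representable value first. A standalone sketch of that overflow check with two worked cases (the helper name is made up):

#include <cassert>
#include <cstdint>
#include <limits>

// A * B fits in uint64_t only if B <= max / A (for non-zero A).
static bool mulOverflows(uint64_t A, uint64_t B) {
  if (A == 0 || B == 0)
    return false;
  return (std::numeric_limits<uint64_t>::max)() / A < B;
}

int main() {
  assert(mulOverflows(1ULL << 40, 1ULL << 30));  // 2^70 does not fit
  assert(!mulOverflows(1ULL << 20, 1ULL << 30)); // 2^50 fits
  return 0;
}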
+static uint64_t computeThreadsNeeded( + const size_t (&tripCounts)[3], const size_t (&groupSizes)[3], + uint32_t simdWidth) { + uint64_t groupCount[3]; + for (int i = 0; i < 3; ++i) { + if (tripCounts[i] == 0 || groupSizes[i] == 0) + return (std::numeric_limits::max)(); + groupCount[i] = + (uint64_t(tripCounts[i]) + groupSizes[i] - 1) / groupSizes[i]; + if (groupCount[i] > (std::numeric_limits::max)()) + return (std::numeric_limits::max)(); + } + for (int i = 1; i < 3; ++i) { + if ((std::numeric_limits::max)() / groupCount[0] < groupCount[i]) + return (std::numeric_limits::max)(); + groupCount[0] *= groupCount[i]; + } + // Multiplication of the group sizes must never overflow uint64_t + // for any existing device. + uint64_t localWorkSize = + uint64_t(groupSizes[0]) * groupSizes[1] * groupSizes[2]; + uint64_t threadsPerWG = ((localWorkSize + simdWidth - 1) / simdWidth); + + // Check that the total number of threads fits uint64_t. + if ((std::numeric_limits::max)() / groupCount[0] < + threadsPerWG) + return (std::numeric_limits::max)(); + + return groupCount[0] * threadsPerWG; +} + +static void decideLoopKernelGroupArguments( + int32_t DeviceId, int32_t ThreadLimit, TgtNDRangeDescTy *LoopLevels, + cl_kernel Kernel, size_t *GroupSizes, size_t *GroupCounts) { + + size_t maxGroupSize = DeviceInfo->maxWorkGroupSize[DeviceId]; + auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel]; + size_t kernelWidth = KernelProperty.Width; + DP("Assumed kernel SIMD width is %zu\n", KernelProperty.SIMDWidth); + DP("Preferred group size is multiple of %zu\n", kernelWidth); + + size_t kernelMaxThreadGroupSize = KernelProperty.MaxThreadGroupSize; + if (kernelMaxThreadGroupSize < maxGroupSize) { + maxGroupSize = kernelMaxThreadGroupSize; + DP("Capping maximum thread group size to %zu due to kernel constraints.\n", + maxGroupSize); + } + + bool maxGroupSizeForced = false; + + if (ThreadLimit > 0) { + maxGroupSizeForced = true; + + if ((uint32_t)ThreadLimit <= maxGroupSize) { + maxGroupSize = ThreadLimit; + DP("Max group size is set to %zu (thread_limit clause)\n", maxGroupSize); + } else { + DP("thread_limit(%" PRIu32 ") exceeds current maximum %zu\n", + ThreadLimit, maxGroupSize); + } + } + + if (DeviceInfo->Option.ThreadLimit > 0) { + maxGroupSizeForced = true; + + if (DeviceInfo->Option.ThreadLimit <= maxGroupSize) { + maxGroupSize = DeviceInfo->Option.ThreadLimit; + DP("Max group size is set to %zu (OMP_THREAD_LIMIT)\n", maxGroupSize); + } else { + DP("OMP_THREAD_LIMIT(%" PRIu32 ") exceeds current maximum %zu\n", + DeviceInfo->Option.ThreadLimit, maxGroupSize); + } + } + + if (DeviceInfo->Option.NumTeams > 0) + DP("OMP_NUM_TEAMS(%" PRIu32 ") is ignored\n", DeviceInfo->Option.NumTeams); + + GroupCounts[0] = GroupCounts[1] = GroupCounts[2] = 1; + size_t groupSizes[3] = {maxGroupSize, 1, 1}; + TgtLoopDescTy *level = LoopLevels->Levels; + int32_t distributeDim = LoopLevels->DistributeDim; + assert(distributeDim >= 0 && distributeDim <= 2 && + "Invalid distribute dimension."); + int32_t numLoopLevels = LoopLevels->NumLoops; + assert((numLoopLevels > 0 && numLoopLevels <= 3) && + "Invalid loop nest description for ND partitioning"); + + // Compute global widths for X/Y/Z dimensions. 
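As the trip-count expression just below implies, each loop level is treated as an inclusive [Lb, Ub] range walked with a positive stride. A self-contained restatement with a few worked cases:

#include <cassert>
#include <cstdint>

// Trip count for an inclusive [Lb, Ub] range with positive Stride, matching
// the expression used below: (Ub - Lb + Stride) / Stride.
static uint64_t tripCount(int64_t Lb, int64_t Ub, int64_t Stride) {
  if (Ub < Lb)
    return 0;
  return static_cast<uint64_t>(Ub - Lb + Stride) /
         static_cast<uint64_t>(Stride);
}

int main() {
  assert(tripCount(0, 99, 1) == 100); // 0, 1, ..., 99
  assert(tripCount(0, 99, 4) == 25);  // 0, 4, ..., 96
  assert(tripCount(5, 4, 1) == 0);    // empty range
  return 0;
}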
+ size_t tripCounts[3] = {1, 1, 1}; + + for (int32_t i = 0; i < numLoopLevels; i++) { + assert(level[i].Stride > 0 && "Invalid loop stride for ND partitioning"); + DP("Loop %" PRIu32 ": lower bound = %" PRId64 ", upper bound = %" PRId64 + ", Stride = %" PRId64 "\n", + i, level[i].Lb, level[i].Ub, level[i].Stride); + if (level[i].Ub < level[i].Lb) + tripCounts[i] = 0; + else + tripCounts[i] = + (level[i].Ub - level[i].Lb + level[i].Stride) / level[i].Stride; + } + + // Check if any of the loop has zero iterations. + if (tripCounts[0] == 0 || tripCounts[1] == 0 || tripCounts[2] == 0) { + std::fill(GroupSizes, GroupSizes + 3, 1); + std::fill(GroupCounts, GroupCounts + 3, 1); + if (distributeDim > 0 && tripCounts[distributeDim] != 0) { + // There is a distribute dimension, and the distribute loop + // has non-zero iterations, but some inner parallel loop + // has zero iterations. We still want to split the distribute + // loop's iterations between many WGs (of size 1), but the inner/lower + // dimensions should be 1x1. + // Note that this code is currently dead, because we are not + // hoisting the inner loops' bounds outside of the target regions. + // The code is here just for completeness. + size_t distributeTripCount = tripCounts[distributeDim]; + GroupCounts[distributeDim] = distributeTripCount; + } + return; + } + + if (!maxGroupSizeForced) { + // Use clGetKernelSuggestedLocalWorkSizeINTEL to compute group sizes, + // or fallback to setting dimension 0 width to SIMDWidth. + // Note that in case of user-specified LWS groupSizes[0] + // is already set according to the specified value. + size_t globalSizes[3] = { tripCounts[0], tripCounts[1], tripCounts[2] }; + if (distributeDim > 0) { + // There is a distribute dimension. + globalSizes[distributeDim - 1] *= globalSizes[distributeDim]; + globalSizes[distributeDim] = 1; + } + + cl_int rc = CL_DEVICE_NOT_FOUND; + size_t suggestedGroupSizes[3] = {1, 1, 1}; + if (DeviceInfo->Option.Flags.UseDriverGroupSizes && + DeviceInfo->isExtensionFunctionEnabled( + DeviceId, clGetKernelSuggestedLocalWorkSizeINTELId)) { + CALL_CL_EXT(DeviceId, rc, clGetKernelSuggestedLocalWorkSizeINTEL, + DeviceInfo->Queues[DeviceId], Kernel, 3, nullptr, globalSizes, + suggestedGroupSizes); + } + if (rc == CL_SUCCESS) { + groupSizes[0] = suggestedGroupSizes[0]; + groupSizes[1] = suggestedGroupSizes[1]; + groupSizes[2] = suggestedGroupSizes[2]; + } else { + if (maxGroupSize > kernelWidth) { + groupSizes[0] = kernelWidth; + } + if (distributeDim == 0 && + // We need to know exact number of HW threads available + // on the device, so we need cl_intel_device_attribute_query + // extension to be supported. + DeviceInfo->Extensions[DeviceId].DeviceAttributeQuery == + ExtensionStatusEnabled) { + // If there is a distribute dimension, then we do not use + // thin HW threads, since we do not know anything about + // the iteration space of the inner parallel loop regions. + // + // If there is no distribute dimension, then try to use thiner + // HW threads to get more independent HW threads executing + // the kernel - this may allow more parallelism due to + // the stalls being distributed across multiple HW threads rather + // than across SIMD lanes within one HW thread. 
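A standalone sketch of the thinning heuristic described above, with made-up device numbers; it mirrors the loop that follows, including the restore of the previously saved group size on exit:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t TotalHWThreads = 512 * 7; // e.g. 512 EUs x 7 threads per EU
  const double Threshold = 0.1;            // ThinThreadsThreshold default
  const uint32_t SimdWidth = 16;
  const uint64_t TripCount = 4096;

  // HW threads needed for a given local work size in dimension 0.
  auto threadsNeeded = [&](uint64_t L) {
    uint64_t Groups = (TripCount + L - 1) / L;
    uint64_t ThreadsPerGroup = (L + SimdWidth - 1) / SimdWidth;
    return Groups * ThreadsPerGroup;
  };

  uint64_t LWS = 64; // initial local work size in dimension 0
  uint64_t PrevLWS = LWS;
  uint64_t Needed = threadsNeeded(LWS);
  while (Needed < Threshold * TotalHWThreads) {
    PrevLWS = LWS;
    if (LWS <= 1)
      break;
    LWS >>= 1; // try a thinner work-group
    Needed = threadsNeeded(LWS);
  }
  LWS = PrevLWS;

  std::printf("chosen local work size: %llu\n",
              (unsigned long long)LWS); // prints 16 for these made-up numbers
  return 0;
}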
+ assert(groupSizes[1] == 1 && groupSizes[2] == 1 && + "Unexpected group sizes for dimensions 1 or/and 2."); + uint32_t simdWidth = KernelProperty.SIMDWidth; + auto &deviceProperties = DeviceInfo->DeviceProperties[DeviceId]; + uint32_t numEUsPerSubslice = deviceProperties.NumEUsPerSubslice; + uint32_t numSubslices = deviceProperties.NumSlices * + deviceProperties.NumSubslicesPerSlice; + uint32_t numThreadsPerEU = deviceProperties.NumThreadsPerEU; + uint64_t totalThreads = uint64_t(numThreadsPerEU) * numEUsPerSubslice * + numSubslices; + totalThreads *= DeviceInfo->Option.ThinThreadsThreshold; + + uint64_t groupSizePrev = groupSizes[0]; + uint64_t threadsNeeded = + computeThreadsNeeded(tripCounts, groupSizes, simdWidth); + while (threadsNeeded < totalThreads) { + groupSizePrev = groupSizes[0]; + // Try to half the local work size (if possible) and see + // how many HW threads the kernel will require with this + // new local work size. + // In most implementations the initial groupSizes[0] + // will be a power-of-two. + if (groupSizes[0] <= 1) + break; + groupSizes[0] >>= 1; + threadsNeeded = + computeThreadsNeeded(tripCounts, groupSizes, simdWidth); + } + groupSizes[0] = groupSizePrev; + } + } + } + + for (int32_t i = 0; i < numLoopLevels; i++) { + if (i < distributeDim) { + GroupCounts[i] = 1; + continue; + } + size_t trip = tripCounts[i]; + if (groupSizes[i] >= trip) + groupSizes[i] = trip; + GroupCounts[i] = (trip + groupSizes[i] - 1) / groupSizes[i]; + } + std::copy(groupSizes, groupSizes + 3, GroupSizes); +} + +static void decideKernelGroupArguments( + int32_t DeviceId, int32_t NumTeams, int32_t ThreadLimit, + cl_kernel Kernel, size_t *GroupSizes, size_t *GroupCounts) { + const KernelInfoTy *KInfo = DeviceInfo->getKernelInfo(DeviceId, Kernel); + if (!KInfo) { + DP("Warning: Cannot find kernel information for kernel " DPxMOD ".\n", + DPxPTR(Kernel)); + } + size_t maxGroupSize = DeviceInfo->maxWorkGroupSize[DeviceId]; + bool maxGroupSizeForced = false; + bool maxGroupCountForced = false; + + auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel]; + size_t kernelWidth = KernelProperty.Width; + DP("Preferred group size is multiple of %zu\n", kernelWidth); + + size_t kernelMaxThreadGroupSize = KernelProperty.MaxThreadGroupSize; + if (kernelMaxThreadGroupSize < maxGroupSize) { + maxGroupSize = kernelMaxThreadGroupSize; + DP("Capping maximum thread group size to %zu due to kernel constraints.\n", + maxGroupSize); + } + + if (ThreadLimit > 0) { + maxGroupSizeForced = true; + + if ((uint32_t)ThreadLimit <= maxGroupSize) { + maxGroupSize = ThreadLimit; + DP("Max group size is set to %zu (thread_limit clause)\n", + maxGroupSize); + } else { + DP("thread_limit(%" PRIu32 ") exceeds current maximum %zu\n", + ThreadLimit, maxGroupSize); + } + } + + if (DeviceInfo->Option.ThreadLimit > 0) { + maxGroupSizeForced = true; + + if (DeviceInfo->Option.ThreadLimit <= maxGroupSize) { + maxGroupSize = DeviceInfo->Option.ThreadLimit; + DP("Max group size is set to %zu (OMP_THREAD_LIMIT)\n", maxGroupSize); + } else { + DP("OMP_THREAD_LIMIT(%" PRIu32 ") exceeds current maximum %zu\n", + DeviceInfo->Option.ThreadLimit, maxGroupSize); + } + } + + size_t maxGroupCount = 0; + + if (NumTeams > 0) { + maxGroupCount = NumTeams; + maxGroupCountForced = true; + DP("Max group count is set to %zu " + "(num_teams clause or no teams construct)\n", maxGroupCount); + } else if (DeviceInfo->Option.NumTeams > 0) { + // OMP_NUM_TEAMS only matters, if num_teams() clause is absent. 
+ maxGroupCount = DeviceInfo->Option.NumTeams; + maxGroupCountForced = true; + DP("Max group count is set to %zu (OMP_NUM_TEAMS)\n", maxGroupCount); + } + + if (maxGroupCountForced) { + // If number of teams is specified by the user, then use kernelWidth + // WIs per WG by default, so that it matches + // decideLoopKernelGroupArguments() behavior. + if (!maxGroupSizeForced) { + maxGroupSize = kernelWidth; + } + } else { + maxGroupCount = DeviceInfo->maxExecutionUnits[DeviceId]; + } + + GroupSizes[0] = maxGroupSize; + GroupSizes[1] = GroupSizes[2] = 1; + + if (KInfo && KInfo->getWINum()) { + GroupSizes[0] = + (std::min)(KInfo->getWINum(), static_cast(GroupSizes[0])); + DP("Capping maximum thread group size to %" PRIu64 + " due to kernel constraints (reduction).\n", KInfo->getWINum()); + } + + GroupCounts[0] = maxGroupCount; + GroupCounts[1] = GroupCounts[2] = 1; + if (!maxGroupCountForced) { + if (KInfo && KInfo->getHasTeamsReduction() && + DeviceInfo->Option.ReductionSubscriptionRate) { + if (!KInfo->isAtomicFreeReduction() || + !DeviceInfo->Option.ReductionSubscriptionRateIsDefault) { + // Use reduction subscription rate 1 for kernels using + // atomic-free reductions, unless user forced reduction subscription + // rate via environment. + GroupCounts[0] /= DeviceInfo->Option.ReductionSubscriptionRate; + GroupCounts[0] = (std::max)(GroupCounts[0], size_t(1)); + } + } else { + GroupCounts[0] *= DeviceInfo->Option.SubscriptionRate; + } + } + + if (KInfo && KInfo->getWGNum()) { + GroupCounts[0] = + (std::min)(KInfo->getWGNum(), static_cast(GroupCounts[0])); + DP("Capping maximum thread groups count to %" PRIu64 + " due to kernel constraints (reduction).\n", KInfo->getWGNum()); + } +} + +static inline int32_t runTargetTeamNDRegion( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, void *LoopDesc) { + + cl_kernel Kernel = *static_cast(TgtEntryPtr); + if (!Kernel) { + REPORT("Failed to invoke deleted kernel.\n"); + return OFFLOAD_FAIL; + } + + // Decide group sizes and counts + size_t LocalWorkSize[3] = {1, 1, 1}; + size_t NumWorkGroups[3] = {1, 1, 1}; + if (LoopDesc) { + decideLoopKernelGroupArguments(DeviceId, ThreadLimit, + (TgtNDRangeDescTy *)LoopDesc, Kernel, + LocalWorkSize, NumWorkGroups); + } else { + decideKernelGroupArguments(DeviceId, NumTeams, ThreadLimit, Kernel, + LocalWorkSize, NumWorkGroups); + } + + size_t GlobalWorkSize[3]; + for (int32_t I = 0; I < 3; ++I) + GlobalWorkSize[I] = LocalWorkSize[I] * NumWorkGroups[I]; + + DP("Team sizes = {%zu, %zu, %zu}\n", LocalWorkSize[0], LocalWorkSize[1], + LocalWorkSize[2]); + DP("Number of teams = {%zu, %zu, %zu}\n", + GlobalWorkSize[0] / LocalWorkSize[0], GlobalWorkSize[1] / LocalWorkSize[1], + GlobalWorkSize[2] / LocalWorkSize[2]); + + // Protect thread-unsafe OpenCL API calls + DeviceInfo->Mutexes[DeviceId].lock(); + + // Set kernel args + for (int32_t I = 0; I < NumArgs; ++I) { + ptrdiff_t Offset = TgtOffsets[I]; + const char *ArgType = "Unknown"; + auto *KernelInfo = DeviceInfo->getKernelInfo(DeviceId, Kernel); + if (KernelInfo && KernelInfo->isArgLiteral(I)) { + uint32_t Size = KernelInfo->getArgSize(I); + CALL_CL_RET_FAIL(clSetKernelArg, Kernel, I, Size, TgtArgs[I]); + ArgType = "ByVal"; + } else if (Offset == (std::numeric_limits::max)()) { + // Offset equal to MAX(ptrdiff_t) means that the argument + // must be passed as literal, and the offset should be ignored. 
+ intptr_t Arg = (intptr_t)TgtArgs[I]; + CALL_CL_RET_FAIL(clSetKernelArg, Kernel, I, sizeof(Arg), &Arg); + ArgType = "Scalar"; + } else { + ArgType = "Pointer"; + void *Ptr = (void *)((intptr_t)TgtArgs[I] + Offset); + if (DeviceInfo->Option.Flags.UseBuffer && + DeviceInfo->ClMemBuffers[DeviceId].count(Ptr) > 0) { + CALL_CL_RET_FAIL(clSetKernelArg, Kernel, I, sizeof(cl_mem), &Ptr); + ArgType = "ClMem"; + } else if (DeviceInfo->Option.Flags.UseSVM) { + CALL_CL_RET_FAIL(clSetKernelArgSVMPointer, Kernel, I, Ptr); + } else { + if (!DeviceInfo->isExtensionFunctionEnabled( + DeviceId, clSetKernelArgMemPointerINTELId)) { + DP("Error: Extension %s is not supported\n", + DeviceInfo->getExtensionFunctionName( + DeviceId, clSetKernelArgMemPointerINTELId)); + return OFFLOAD_FAIL; + } + CALL_CL_EXT_RET_FAIL( + DeviceId, clSetKernelArgMemPointerINTEL, Kernel, I, Ptr); + } + } + DP("Kernel %s Arg %d set successfully\n", ArgType, I); + (void)ArgType; + } + + auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel]; + std::vector ImplicitSVMArgs; + std::vector ImplicitUSMArgs; + std::map HasUSMArgs{ + {TARGET_ALLOC_DEVICE, false}, + {TARGET_ALLOC_HOST, false}, + {TARGET_ALLOC_SHARED, false} + }; + auto &AllocInfos = DeviceInfo->MemAllocInfo; + + /// Kernel-dependent implicit arguments + for (auto Ptr : KernelProperty.ImplicitArgs) { + if (!Ptr) + continue; + // "Ptr" is not always the allocation information known to libomptarget, so + // use "search" instead of "find". + auto *Info = AllocInfos[DeviceId]->search(Ptr); + if (Info) { + if ((int32_t)Info->Kind == TARGET_ALLOC_SVM) { + ImplicitSVMArgs.push_back(Ptr); + } else { + ImplicitUSMArgs.push_back(Ptr); + HasUSMArgs[Info->Kind] = true; + } + } + if (DeviceInfo->Option.Flags.UseSingleContext) { + Info = AllocInfos[DeviceInfo->NumDevices]->search(Ptr); + if (Info) { + ImplicitUSMArgs.push_back(Ptr); + HasUSMArgs[TARGET_ALLOC_HOST] = true; + } + } + } + + /// Kernel-independent implicit arguments + AllocInfos[DeviceId]->getImplicitArgs(ImplicitSVMArgs, ImplicitUSMArgs); + for (auto &ArgKind : HasUSMArgs) + if (AllocInfos[DeviceId]->hasImplicitUSMArg(ArgKind.first)) + ArgKind.second = true; + if (DeviceInfo->Option.Flags.UseSingleContext) { + auto ID = DeviceInfo->NumDevices; + AllocInfos[ID]->getImplicitArgs(ImplicitSVMArgs, ImplicitUSMArgs); + if (AllocInfos[ID]->hasImplicitUSMArg(TARGET_ALLOC_HOST)) + HasUSMArgs[TARGET_ALLOC_HOST] = true; + } + + if (ImplicitSVMArgs.size() > 0) { + DP("Calling clSetKernelExecInfo to pass %zu implicit SVM arguments " + "to kernel " DPxMOD "\n", ImplicitSVMArgs.size(), DPxPTR(Kernel)); + CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel, CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(void *) * ImplicitSVMArgs.size(), + ImplicitSVMArgs.data()); + } + + if (ImplicitUSMArgs.size() > 0) { + // Report non-argument USM pointers to the runtime. + DP("Calling clSetKernelExecInfo to pass %zu implicit USM arguments " + "to kernel " DPxMOD "\n", ImplicitUSMArgs.size(), DPxPTR(Kernel)); + CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel, + CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, + sizeof(void *) * ImplicitUSMArgs.size(), + ImplicitUSMArgs.data()); + // Mark the kernel as supporting indirect USM accesses, otherwise, + // clEnqueueNDRangeKernel call below will fail. 
+ cl_bool KernelSupportsUSM = CL_TRUE; + if (HasUSMArgs[TARGET_ALLOC_HOST]) + CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel, + CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL, + sizeof(cl_bool), &KernelSupportsUSM); + if (HasUSMArgs[TARGET_ALLOC_DEVICE]) + CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel, + CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, + sizeof(cl_bool), &KernelSupportsUSM); + if (HasUSMArgs[TARGET_ALLOC_SHARED]) + CALL_CL_RET_FAIL(clSetKernelExecInfo, Kernel, + CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, + sizeof(cl_bool), &KernelSupportsUSM); + } + + cl_event Event; + CALL_CL_RET_FAIL(clEnqueueNDRangeKernel, DeviceInfo->Queues[DeviceId], + Kernel, 3, nullptr, GlobalWorkSize, + LocalWorkSize, 0, nullptr, &Event); + + DeviceInfo->Mutexes[DeviceId].unlock(); + + DP("Started executing kernel.\n"); + + CALL_CL_RET_FAIL(clWaitForEvents, 1, &Event); + if (DeviceInfo->Option.Flags.EnableProfile) { + std::vector Buf; + size_t BufSize; + CALL_CL_RET_FAIL(clGetKernelInfo, Kernel, CL_KERNEL_FUNCTION_NAME, 0, + nullptr, &BufSize); + std::string KernelName("Kernel "); + if (BufSize > 0) { + Buf.resize(BufSize); + CALL_CL_RET_FAIL(clGetKernelInfo, Kernel, CL_KERNEL_FUNCTION_NAME, + Buf.size(), Buf.data(), nullptr); + KernelName += Buf.data(); + } + DeviceInfo->getProfiles(DeviceId).update(KernelName.c_str(), Event); + } + DP("Successfully finished kernel execution.\n"); + + return OFFLOAD_SUCCESS; +} + +void *RTLDeviceInfoTy::allocDataClMem(int32_t DeviceId, size_t Size) { + cl_mem ret = nullptr; + cl_int rc; + + CALL_CL_RVRC(ret, clCreateBuffer, rc, getContext(DeviceId), + CL_MEM_READ_WRITE, Size, nullptr); + if (rc != CL_SUCCESS) + return nullptr; + + std::unique_lock lock(Mutexes[DeviceId]); + ClMemBuffers[DeviceId].insert((void *)ret); + + DP("Allocated cl_mem data " DPxMOD "\n", DPxPTR(ret)); + return (void *)ret; +} + +uint32_t RTLDeviceInfoTy::getPCIDeviceId(int32_t DeviceId) { + if (Extensions[DeviceId].DeviceAttributeQuery == ExtensionStatusEnabled) + return DeviceProperties[DeviceId].DeviceId; + + uint32_t Id = 0; +#ifndef _WIN32 + // Linux: Device name contains "[0xABCD]" device identifier. + if (Option.DeviceType == CL_DEVICE_TYPE_GPU) { + std::string DeviceName(Names[DeviceId].data()); + auto P = DeviceName.rfind("["); + if (P != std::string::npos && DeviceName.size() - P >= 8) + Id = std::strtol(DeviceName.substr(P + 1, 6).c_str(), nullptr, 16); + } +#endif + return Id; +} + +uint64_t RTLDeviceInfoTy::getDeviceArch(int32_t DeviceId) { + if (Option.DeviceType == CL_DEVICE_TYPE_CPU) + return DeviceArch_x86_64; + + uint32_t PCIDeviceId = getPCIDeviceId(DeviceId); + if (PCIDeviceId != 0) { + for (auto &Arch : DeviceArchMap) + for (auto Id : Arch.second) + if (PCIDeviceId == Id || (PCIDeviceId & 0xFF00) == Id) + return Arch.first; // Exact match or prefix match + } + + std::string DeviceName(Names[DeviceId].data()); +#ifdef _WIN32 + // Windows: Device name contains published product name. 
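A standalone sketch of the Linux branch above: extracting the bracketed hexadecimal device id from an OpenCL device name string (the sample name is made up):

#include <cassert>
#include <cstdlib>
#include <string>

static unsigned pciIdFromName(const std::string &DeviceName) {
  auto P = DeviceName.rfind('[');
  if (P == std::string::npos || DeviceName.size() - P < 8)
    return 0;
  // substr picks "0xABCD"; strtol with base 16 accepts the 0x prefix.
  return static_cast<unsigned>(
      std::strtol(DeviceName.substr(P + 1, 6).c_str(), nullptr, 16));
}

int main() {
  assert(pciIdFromName("Example(R) Graphics [0x9a49]") == 0x9a49);
  assert(pciIdFromName("Some CPU Device") == 0);
  return 0;
}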
+ for (auto &Arch : DeviceArchStrMap) + for (auto Str : Arch.second) + if (DeviceName.find(Str) != std::string::npos) + return Arch.first; +#endif + + DP("Warning: Cannot decide device arch for %s.\n", DeviceName.c_str()); + return DeviceArch_None; +} + +cl_unified_shared_memory_type_intel RTLDeviceInfoTy::getMemAllocType( + int32_t DeviceId, const void *Ptr) { + cl_unified_shared_memory_type_intel MemType = CL_MEM_TYPE_UNKNOWN_INTEL; + CALL_CL_EXT_RET(DeviceId, MemType, clGetMemAllocInfoINTEL, + getContext(DeviceId), Ptr, CL_MEM_ALLOC_TYPE_INTEL, + sizeof(MemType), &MemType, nullptr); + return MemType; +} + +const KernelInfoTy *RTLDeviceInfoTy::getKernelInfo( + int32_t DeviceId, const cl_kernel &Kernel) const { + for (auto &Program : Programs[DeviceId]) { + auto *KernelInfo = Program.getKernelInfo(Kernel); + if (KernelInfo) + return KernelInfo; + } + + return nullptr; +} + +// Get memory attributes for the given allocation size. +std::unique_ptr> +RTLDeviceInfoTy::getAllocMemProperties(int32_t DeviceId, size_t Size) { + std::vector Properties; + Properties.push_back(0); + + return std::make_unique>( + std::move(Properties)); +} + +void SpecConstantsTy::setProgramConstants( + int32_t DeviceId, cl_program Program) const { + cl_int Rc; + + if (!DeviceInfo->isExtensionFunctionEnabled( + DeviceId, clSetProgramSpecializationConstantId)) { + DP("Error: Extension %s is not supported.\n", + DeviceInfo->getExtensionFunctionName( + DeviceId, clSetProgramSpecializationConstantId)); + return; + } + + for (int I = ConstantValues.size(); I > 0; --I) { + cl_uint Id = static_cast(ConstantIds[I - 1]); + size_t Size = ConstantValueSizes[I - 1]; + const void *Val = ConstantValues[I - 1]; + CALL_CL_EXT_SILENT(DeviceId, Rc, clSetProgramSpecializationConstant, + Program, Id, Size, Val); + if (Rc == CL_SUCCESS) + DP("Set specialization constant '0x%X'\n", static_cast(Id)); + } +} + +int32_t ExtensionsTy::getExtensionsInfoForDevice(int32_t DeviceNum) { + // Identify the size of OpenCL extensions string. + size_t RetSize = 0; + // If the below call fails, some extensions's status may be + // left ExtensionStatusUnknown, so only ExtensionStatusEnabled + // actually means that the extension is enabled. + DP("Getting extensions for device %d\n", DeviceNum); + + cl_device_id DeviceId = DeviceInfo->Devices[DeviceNum]; + CALL_CL_RET_FAIL(clGetDeviceInfo, DeviceId, CL_DEVICE_EXTENSIONS, 0, nullptr, + &RetSize); + + std::unique_ptr Data(new char[RetSize]); + CALL_CL_RET_FAIL(clGetDeviceInfo, DeviceId, CL_DEVICE_EXTENSIONS, RetSize, + Data.get(), &RetSize); + + std::string Extensions(Data.get()); + DP("Device extensions: %s\n", Extensions.c_str()); + + if (UnifiedSharedMemory == ExtensionStatusUnknown && + Extensions.find("cl_intel_unified_shared_memory") != std::string::npos) { + UnifiedSharedMemory = ExtensionStatusEnabled; + DP("Extension UnifiedSharedMemory enabled.\n"); + } + + if (DeviceAttributeQuery == ExtensionStatusUnknown && + Extensions.find("cl_intel_device_attribute_query") != std::string::npos) { + DeviceAttributeQuery = ExtensionStatusEnabled; + DP("Extension DeviceAttributeQuery enabled.\n"); + } + + // Check if the extension was not explicitly disabled, i.e. + // that its current status is unknown. + if (GetDeviceGlobalVariablePointer == ExtensionStatusUnknown) + // FIXME: use the right extension name. 
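The checks above match extension names with a substring search over the whole CL_DEVICE_EXTENSIONS string. For comparison, a standalone sketch of an exact-token lookup over the space-separated list (all names here are illustrative):

#include <cassert>
#include <sstream>
#include <string>
#include <unordered_set>

// CL_DEVICE_EXTENSIONS is a single space-separated string; tokenizing it
// gives exact-name lookups instead of substring matches.
static std::unordered_set<std::string> splitExtensions(const std::string &S) {
  std::unordered_set<std::string> Out;
  std::istringstream In(S);
  std::string Tok;
  while (In >> Tok)
    Out.insert(Tok);
  return Out;
}

int main() {
  auto Exts = splitExtensions(
      "cl_khr_fp64 cl_intel_unified_shared_memory cl_intel_subgroups");
  assert(Exts.count("cl_intel_unified_shared_memory") == 1);
  assert(Exts.count("cl_intel_unified") == 0); // a substring search would match
  return 0;
}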
+ if (Extensions.find("") != std::string::npos) { + GetDeviceGlobalVariablePointer = ExtensionStatusEnabled; + DP("Extension clGetDeviceGlobalVariablePointerINTEL enabled.\n"); + } + + if (SuggestedGroupSize == ExtensionStatusUnknown) + // FIXME: use the right extension name. + if (Extensions.find("") != std::string::npos) { + SuggestedGroupSize = ExtensionStatusEnabled; + DP("Extension clGetKernelSuggestedLocalWorkSizeINTEL enabled.\n"); + } + + std::for_each(LibdeviceExtensions.begin(), LibdeviceExtensions.end(), + [&Extensions](LibdeviceExtDescTy &Desc) { + if (Desc.Status == ExtensionStatusUnknown) + if (Extensions.find(Desc.Name) != std::string::npos) { + Desc.Status = ExtensionStatusEnabled; + DP("Extension %s enabled.\n", Desc.Name); + } + }); + + return CL_SUCCESS; +} + +int32_t DevicePropertiesTy::getDeviceProperties(cl_device_id ID) { + CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_ID_INTEL, sizeof(cl_uint), + &DeviceId, nullptr); + CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_NUM_SLICES_INTEL, + sizeof(cl_uint), &NumSlices, nullptr); + CALL_CL_RET_FAIL(clGetDeviceInfo, ID, + CL_DEVICE_NUM_SUB_SLICES_PER_SLICE_INTEL, sizeof(cl_uint), + &NumSubslicesPerSlice, nullptr); + CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, + sizeof(cl_uint), &NumEUsPerSubslice, nullptr); + CALL_CL_RET_FAIL(clGetDeviceInfo, ID, CL_DEVICE_NUM_THREADS_PER_EU_INTEL, + sizeof(cl_uint), &NumThreadsPerEU, nullptr); + + NumHWThreads = + NumSlices * NumSubslicesPerSlice * NumEUsPerSubslice * NumThreadsPerEU; + + return OFFLOAD_SUCCESS; +} + +OpenCLProgramTy::~OpenCLProgramTy() { + for (auto Kernel : Kernels) { + if (Kernel) + CALL_CL_RET_VOID(clReleaseKernel, Kernel); + } + for (auto PGM : Programs) { + CALL_CL_RET_VOID(clReleaseProgram, PGM); + } + if (RequiresProgramLink) { + CALL_CL_RET_VOID(clReleaseProgram, FinalProgram); + } + // Unload offload entries + for (auto &Entry : OffloadEntries) + delete[] Entry.Base.name; +} + +/// Add program read from a single section +int32_t OpenCLProgramTy::addProgramIL(const size_t Size, + const unsigned char *Image) { + cl_program PGM; + cl_int RC; + CALL_CL_RVRC(PGM, clCreateProgramWithIL, RC, Context, Image, Size); + + auto Flags = DeviceInfo->Option.Flags; + + if (RC != CL_SUCCESS || Flags.ShowBuildLog) + debugPrintBuildLog(PGM, Device); + + if (RC != CL_SUCCESS) { + DP("Error: Failed to create program from SPIR-V: %d\n", RC); + return OFFLOAD_FAIL; + } + + DeviceInfo->Option.CommonSpecConstants.setProgramConstants(DeviceId, PGM); + Programs.push_back(PGM); + IsBinary = false; + + // First SPIR-V image is expected to be the only image or the first image + // that contains global information. We also add fallback libdevice image + // here if required. + if (Programs.size() == 1 && Flags.LinkLibDevice) { + auto &Extensions = DeviceInfo->Extensions[DeviceId].LibdeviceExtensions; + for (auto &ExtDesc : Extensions) { + if (ExtDesc.Status == ExtensionStatusEnabled) { + DP("Fallback libdevice RTL %s is not required.\n", + ExtDesc.FallbackLibName); + continue; + } + // Device runtime does not support this libdevice extension, + // so we have to link in the fallback implementation. + // + // TODO: the device image must specify which libdevice extensions + // are actually required. We should link only the required + // fallback implementations. 
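+      // Until then, conservatively link every fallback library whose
+      // extension is not reported as supported by the device runtime.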
+ PGM = createProgramFromFile(ExtDesc.FallbackLibName, DeviceId); + if (PGM) { + DP("Added fallback libdevice RTL %s.\n", ExtDesc.FallbackLibName); + Programs.push_back(PGM); + } else { + DP("Cannot add fallback libdeice RTL %s.\n", ExtDesc.FallbackLibName); + } + } + } + return OFFLOAD_SUCCESS; +} + +int32_t OpenCLProgramTy::addProgramBIN(const size_t Size, + const unsigned char *Image) { + cl_program PGM; + cl_int RC; + CALL_CL_RVRC(PGM, clCreateProgramWithBinary, RC, Context, 1, &Device, + &Size, &Image, nullptr); + + if (RC != CL_SUCCESS || DeviceInfo->Option.Flags.ShowBuildLog) + debugPrintBuildLog(PGM, Device); + + if (RC != CL_SUCCESS) { + DP("Error: Failed to create program from binary: %d\n", RC); + return OFFLOAD_FAIL; + } + + DeviceInfo->Option.CommonSpecConstants.setProgramConstants(DeviceId, PGM); + Programs.push_back(PGM); + IsBinary = true; + return OFFLOAD_SUCCESS; +} + +int32_t OpenCLProgramTy::buildPrograms(std::string &CompilationOptions, + std::string &LinkingOptions) { + int32_t RC; + + uint64_t MajorVer, MinorVer; + if (!isValidOneOmpImage(Image, MajorVer, MinorVer)) { + // Handle legacy plain SPIR-V image. + char *ImgBegin = reinterpret_cast(Image->ImageStart); + char *ImgEnd = reinterpret_cast(Image->ImageEnd); + size_t ImgSize = ImgEnd - ImgBegin; + dumpImageToFile(ImgBegin, ImgSize, "OpenMP"); + if (addProgramIL(ImgSize, (unsigned char *)ImgBegin) != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + } + + // Iterate over the images and pick the first one that fits. + char *ImgBegin = reinterpret_cast(Image->ImageStart); + char *ImgEnd = reinterpret_cast(Image->ImageEnd); + size_t ImgSize = ImgEnd - ImgBegin; + ElfL E(ImgBegin, ImgSize); + assert(E.isValidElf() && + "isValidOneOmpImage() returns true for invalid ELF image."); + assert(MajorVer == 1 && MinorVer == 0 && + "FIXME: update image processing for new oneAPI OpenMP version."); + // Collect auxiliary information. + uint64_t ImageCount = 0; + uint64_t MaxImageIdx = 0; + struct V1ImageInfo { + // 0 - native, 1 - SPIR-V + uint64_t Format = (std::numeric_limits::max)(); + std::string CompileOpts; + std::string LinkOpts; + const uint8_t *Begin; + uint64_t Size; + + V1ImageInfo(uint64_t Format, std::string CompileOpts, + std::string LinkOpts, const uint8_t *Begin, uint64_t Size) + : Format(Format), CompileOpts(CompileOpts), + LinkOpts(LinkOpts), Begin(Begin), Size(Size) {} + }; + + std::unordered_map AuxInfo; + + for (auto I = E.section_notes_begin(), IE = E.section_notes_end(); I != IE; + ++I) { + ElfLNote Note = *I; + if (Note.getNameSize() == 0) + continue; + std::string NameStr(Note.getName(), Note.getNameSize()); + if (NameStr != "INTELONEOMPOFFLOAD") + continue; + uint64_t Type = Note.getType(); + std::string DescStr(reinterpret_cast(Note.getDesc()), + Note.getDescSize()); + switch (Type) { + default: + DP("Warning: unrecognized INTELONEOMPOFFLOAD note.\n"); + break; + case NT_INTEL_ONEOMP_OFFLOAD_VERSION: + break; + case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_COUNT: + ImageCount = std::stoull(DescStr); + break; + case NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX: { + std::vector Parts; + do { + auto DelimPos = DescStr.find('\0'); + if (DelimPos == std::string::npos) { + Parts.push_back(DescStr); + break; + } + Parts.push_back(DescStr.substr(0, DelimPos)); + DescStr.erase(0, DelimPos + 1); + } while (Parts.size() < 4); + + // Ignore records with less than 4 strings. 
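+      // A well-formed descriptor carries four '\0'-separated fields:
+      //   Parts[0] - image index, Parts[1] - format (0 = native, 1 = SPIR-V),
+      //   Parts[2] - compilation options, Parts[3] - linking options.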
+ if (Parts.size() != 4) { + DP("Warning: short NT_INTEL_ONEOMP_OFFLOAD_IMAGE_AUX " + "record is ignored.\n"); + continue; + } + + uint64_t Idx = std::stoull(Parts[0]); + MaxImageIdx = (std::max)(MaxImageIdx, Idx); + if (AuxInfo.find(Idx) != AuxInfo.end()) { + DP("Warning: duplicate auxiliary information for image %" PRIu64 + " is ignored.\n", Idx); + continue; + } + AuxInfo.emplace(std::piecewise_construct, + std::forward_as_tuple(Idx), + std::forward_as_tuple(std::stoull(Parts[1]), + Parts[2], Parts[3], + // Image pointer and size + // will be initialized later. + nullptr, 0)); + } + } + } + + if (MaxImageIdx >= ImageCount) + DP("Warning: invalid image index found in auxiliary information.\n"); + + for (auto I = E.sections_begin(), IE = E.sections_end(); I != IE; ++I) { + const char *Prefix = "__openmp_offload_spirv_"; + std::string SectionName((*I).getName() ? (*I).getName() : ""); + if (SectionName.find(Prefix) != 0) + continue; + SectionName.erase(0, std::strlen(Prefix)); + uint64_t Idx = std::stoull(SectionName); + if (Idx >= ImageCount) { + DP("Warning: ignoring image section (index %" PRIu64 + " is out of range).\n", Idx); + continue; + } + + auto AuxInfoIt = AuxInfo.find(Idx); + if (AuxInfoIt == AuxInfo.end()) { + DP("Warning: ignoring image section (no aux info).\n"); + continue; + } + + AuxInfoIt->second.Begin = (*I).getContents(); + AuxInfoIt->second.Size = (*I).getSize(); + } + + for (uint64_t Idx = 0; Idx < ImageCount; ++Idx) { + auto It = AuxInfo.find(Idx); + if (It == AuxInfo.end()) { + DP("Warning: image %" PRIu64 + " without auxiliary information is ingored.\n", Idx); + continue; + } + + const unsigned char *ImgBegin = + reinterpret_cast(It->second.Begin); + size_t ImgSize = It->second.Size; + dumpImageToFile(ImgBegin, ImgSize, "OpenMP"); + + if (It->second.Format == 0) { + // Native format. + RC = addProgramBIN(ImgSize, ImgBegin); + } else if (It->second.Format == 1) { + // SPIR-V format. + RC = addProgramIL(ImgSize, ImgBegin); + } else { + DP("Warning: image %" PRIu64 "is ignored due to unknown format.\n", Idx); + continue; + } + + if (RC != OFFLOAD_SUCCESS) + continue; + + DP("Created offload program from image #%" PRIu64 ".\n", Idx); + if (DeviceInfo->Option.Flags.UseImageOptions) { + CompilationOptions += " " + It->second.CompileOpts; + LinkingOptions += " " + It->second.LinkOpts; + } + + return OFFLOAD_SUCCESS; + } + + return OFFLOAD_FAIL; +} + +int32_t OpenCLProgramTy::compilePrograms(std::string &CompOptions, + std::string &LinkOptions) { + if (IsBinary && Programs.size() > 1) { + // Nothing to be done for split-kernel binaries as later linking step + // falls back to SPIR-V recompilation. + DP("Skipping compilation for multiple binary images.\n"); + RequiresProgramLink = true; + return OFFLOAD_SUCCESS; + } + + cl_int RC; + auto &Flags = DeviceInfo->Option.Flags; + + if (Programs.size() == 1 && + (IsBinary || Flags.EnableSimd || + // Work around GPU API issue: clCompileProgram/clLinkProgram + // does not work with -vc-codegen, so we have to use clBuildProgram. 
+ CompOptions.find(" -vc-codegen ") != std::string::npos)) { + auto BuildOptions = CompOptions + " " + LinkOptions; + CALL_CL(RC, clBuildProgram, Programs[0], 0, nullptr, BuildOptions.c_str(), + nullptr, nullptr); + if (RC != CL_SUCCESS || Flags.ShowBuildLog) { + debugPrintBuildLog(Programs[0], Device); + if (RC != CL_SUCCESS) { + DP("Error: Failed to build program: %d\n", RC); + return OFFLOAD_FAIL; + } + } + RequiresProgramLink = false; + return OFFLOAD_SUCCESS; + } + + // Single or multiple SPIR-V programs are compiled + for (auto &PGM : Programs) { + CALL_CL(RC, clCompileProgram, PGM, 0, nullptr, CompOptions.c_str(), 0, + nullptr, nullptr, nullptr, nullptr); + if (RC != CL_SUCCESS || Flags.ShowBuildLog) { + debugPrintBuildLog(PGM, Device); + if (RC != CL_SUCCESS) { + DP("Error: Failed to compile program: %d\n", RC); + return OFFLOAD_FAIL; + } + } + } + + RequiresProgramLink = true; + return OFFLOAD_SUCCESS; +} + +int32_t OpenCLProgramTy::linkPrograms(std::string &LinkOptions) { + if (!RequiresProgramLink) { + FinalProgram = Programs[0]; + DP("Program linking is not required.\n"); + return OFFLOAD_SUCCESS; + } + + cl_int RC; + CALL_CL_RVRC(FinalProgram, clLinkProgram, RC, Context, 1, &Device, + LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, + nullptr); + if (RC != CL_SUCCESS || DeviceInfo->Option.Flags.ShowBuildLog) { + debugPrintBuildLog(FinalProgram, Device); + if (RC != CL_SUCCESS) { + DP("Error: Failed to link program: %d\n", RC); + return OFFLOAD_FAIL; + } + } + + DP("Successfully linked %zu programs.\n", Programs.size()); + + return OFFLOAD_SUCCESS; +} + +int32_t OpenCLProgramTy::buildKernels() { + size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin); + + Entries.resize(NumEntries); + Kernels.resize(NumEntries); + + ProfileIntervalTy EntriesTimer("OffloadEntriesInit", DeviceId); + EntriesTimer.start(); + if (!loadOffloadTable(NumEntries)) + DP("Warning: could not load offload table.\n"); + EntriesTimer.stop(); + + // We are supposed to have a single final program at this point + for (size_t I = 0; I < NumEntries; I++) { + // Size is 0 means that it is kernel function. + auto Size = Image->EntriesBegin[I].size; + char *Name = Image->EntriesBegin[I].name; + + if (Size != 0) { + EntriesTimer.start(); + void *HostAddr = Image->EntriesBegin[I].addr; + void *TgtAddr = getOffloadVarDeviceAddr(Name, Size); + + if (!TgtAddr) { + TgtAddr = __tgt_rtl_data_alloc(DeviceId, Size, HostAddr, + TARGET_ALLOC_DEFAULT); + __tgt_rtl_data_submit(DeviceId, TgtAddr, HostAddr, Size); + DP("Warning: global variable '%s' allocated. " + "Direct references will not work properly.\n", Name); + } + + DP("Global variable mapped: Name = %s, Size = %zu, " + "HostPtr = " DPxMOD ", TgtPtr = " DPxMOD "\n", + Name, Size, DPxPTR(HostAddr), DPxPTR(TgtAddr)); + Entries[I].addr = TgtAddr; + Entries[I].name = Name; + Entries[I].size = Size; + Kernels[I] = nullptr; + EntriesTimer.stop(); + continue; + } + +#if _WIN32 + // FIXME: temporary allow zero padding bytes in the entries table + // added by MSVC linker (e.g. for incremental linking). + if (!Name) { + // Initialize the members to be on the safe side. + DP("Warning: Entry with a nullptr name!!!\n"); + Entries[I].addr = nullptr; + Entries[I].name = nullptr; + continue; + } +#endif // _WIN32 + cl_int RC; + CALL_CL_RVRC(Kernels[I], clCreateKernel, RC, FinalProgram, Name); + if (RC != CL_SUCCESS) { + // If a kernel was deleted by optimizations (e.g. DCE), then + // clCreateKernel will fail. 
We expect that such a kernel + // will never be actually invoked. + DP("Warning: Failed to create kernel %s, %d\n", Name, RC); + Kernels[I] = nullptr; + } + Entries[I].addr = &Kernels[I]; + Entries[I].name = Name; + + // Do not try to query information for deleted kernels. + if (!Kernels[I]) + continue; + + if (!readKernelInfo(Entries[I])) { + DP("Error: failed to read kernel info for kernel %s\n", Name); + return OFFLOAD_FAIL; + } + + // Retrieve kernel group size info. + auto Kernel = Kernels[I]; + auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel]; + CALL_CL_RET_FAIL(clGetKernelWorkGroupInfo, Kernel, Device, + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + sizeof(size_t), &KernelProperty.Width, nullptr); + CALL_CL_RET_FAIL(clGetKernelSubGroupInfo, Kernel, Device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, sizeof(size_t), + &KernelProperty.SIMDWidth, sizeof(size_t), + &KernelProperty.SIMDWidth, nullptr); + if (KernelProperty.SIMDWidth == 0) { + // clGetKernelSubGroupInfo is not supported on Windows with CPU device, so + // assign default value to avoid any issues when using this variable. + KernelProperty.SIMDWidth = KernelProperty.Width / 2; + } + assert(KernelProperty.SIMDWidth <= KernelProperty.Width && + "Invalid preferred group size multiple."); + CALL_CL_RET_FAIL(clGetKernelWorkGroupInfo, Kernel, Device, + CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), + &KernelProperty.MaxThreadGroupSize, nullptr); + + if (DebugLevel > 0) { + // Show kernel information + std::vector Buf; + size_t BufSize; + cl_uint NumArgs = 0; + CALL_CL_RET_FAIL(clGetKernelInfo, Kernel, CL_KERNEL_NUM_ARGS, + sizeof(cl_uint), &NumArgs, nullptr); + DP("Kernel %zu: Name = %s, NumArgs = %" PRIu32 "\n", I, Name, NumArgs); + for (cl_uint J = 0; J < NumArgs; J++) { + // clGetKernelArgInfo is not supposed to work unless the program is + // built with clCreateProgramWithSource according to the specification. + // We still allow this if the backend RT is capable of returning the + // argument information without using clCreateProgramWithSource. + CALL_CL_SILENT(RC, clGetKernelArgInfo, Kernel, J, + CL_KERNEL_ARG_TYPE_NAME, 0, nullptr, &BufSize); + if (RC != CL_SUCCESS) + break; // Kernel argument info won't be available + Buf.resize(BufSize); + CALL_CL_RET_FAIL(clGetKernelArgInfo, Kernel, J, + CL_KERNEL_ARG_TYPE_NAME, BufSize, Buf.data(), nullptr); + std::string TypeName = Buf.data(); + CALL_CL_RET_FAIL(clGetKernelArgInfo, Kernel, J, CL_KERNEL_ARG_NAME, 0, + nullptr, &BufSize); + Buf.resize(BufSize); + CALL_CL_RET_FAIL(clGetKernelArgInfo, Kernel, J, CL_KERNEL_ARG_NAME, + BufSize, Buf.data(), nullptr); + DP(" Arg %2" PRIu32 ": %s %s\n", J, TypeName.c_str(), + Buf.data() ? Buf.data() : "undefined"); + } + } + } + + Table.EntriesBegin = &(Entries.data()[0]); + Table.EntriesEnd = &(Entries.data()[Entries.size()]); + + return OFFLOAD_SUCCESS; +} + +bool OpenCLProgramTy::readKernelInfo(const __tgt_offload_entry &KernelEntry) { + const cl_kernel *KernelPtr = + reinterpret_cast(KernelEntry.addr); + const char *Name = KernelEntry.name; + std::string InfoVarName(Name); + InfoVarName += "_kernel_info"; + size_t InfoVarSize = 0; + void *InfoVarAddr = getVarDeviceAddr(InfoVarName.c_str(), &InfoVarSize); + // If there is no kernel info variable, then the kernel might have been + // produced by older toolchain - this is acceptable, so return success. 
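+  // Layout of the little-endian kernel info record parsed below:
+  //   uint32 Version, uint32 NumArgs,
+  //   NumArgs x { uint32 ArgIsLiteral, uint32 ArgSize },
+  //   then, depending on Version: uint64 Attributes1 (v2+),
+  //   uint64 WGNum (v3+), uint64 WINum (v4+).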
+  if (!InfoVarAddr)
+    return true;
+  if (InfoVarSize == 0) {
+    DP("Error: kernel info variable cannot have 0 size.\n");
+    return false;
+  }
+  std::vector<char> InfoBuffer;
+  InfoBuffer.resize(InfoVarSize);
+  CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL,
+                  DeviceInfo->Queues[DeviceId],
+                  /*blocking=*/CL_TRUE, InfoBuffer.data(),
+                  InfoVarAddr, InfoVarSize,
+                  /*num_events_in_wait_list=*/0,
+                  /*event_wait_list=*/nullptr,
+                  /*event=*/nullptr);
+  // TODO: add support for big-endian devices, if needed.
+  // Currently supported devices are little-endian.
+  char *ReadPtr = InfoBuffer.data();
+  uint32_t Version = llvm::support::endian::read32le(ReadPtr);
+  if (Version == 0) {
+    DP("Error: version 0 of kernel info structure is illegal.\n");
+    return false;
+  }
+  if (Version > 4) {
+    DP("Error: unsupported version (%" PRIu32 ") of kernel info structure.\n",
+       Version);
+    DP("Error: please use newer OpenMP offload runtime.\n");
+    return false;
+  }
+  ReadPtr += 4;
+  uint32_t KernelArgsNum = llvm::support::endian::read32le(ReadPtr);
+  size_t ExpectedInfoVarSize = static_cast<size_t>(KernelArgsNum) * 8 + 8;
+  // Support Attributes1 since version 2.
+  if (Version > 1)
+    ExpectedInfoVarSize += 8;
+  // Support WGNum since version 3.
+  if (Version > 2)
+    ExpectedInfoVarSize += 8;
+  // Support WINum since version 4.
+  if (Version > 3)
+    ExpectedInfoVarSize += 8;
+  if (InfoVarSize != ExpectedInfoVarSize) {
+    DP("Error: expected kernel info variable size %zu - got %zu\n",
+       ExpectedInfoVarSize, InfoVarSize);
+    return false;
+  }
+  KernelInfoTy Info(Version);
+  ReadPtr += 4;
+  for (uint64_t I = 0; I < KernelArgsNum; ++I) {
+    bool ArgIsLiteral = (llvm::support::endian::read32le(ReadPtr) != 0);
+    ReadPtr += 4;
+    uint32_t ArgSize = llvm::support::endian::read32le(ReadPtr);
+    ReadPtr += 4;
+    Info.addArgInfo(ArgIsLiteral, ArgSize);
+  }
+
+  if (Version > 1) {
+    // Read 8-byte Attributes1 since version 2.
+    uint64_t Attributes1 = llvm::support::endian::read64le(ReadPtr);
+    Info.setAttributes1(Attributes1);
+    ReadPtr += 8;
+  }
+
+  if (Version > 2) {
+    // Read 8-byte WGNum since version 3.
+    uint32_t WGNum = llvm::support::endian::read64le(ReadPtr);
+    Info.setWGNum(WGNum);
+    ReadPtr += 8;
+  }
+
+  if (Version > 3) {
+    // Read 8-byte WINum since version 4.
+ uint32_t WINum = llvm::support::endian::read64le(ReadPtr); + Info.setWINum(WINum); + ReadPtr += 8; + } + + KernelInfo.emplace(std::make_pair(*KernelPtr, std::move(Info))); + return true; +} + +bool OpenCLProgramTy::loadOffloadTable(size_t NumEntries) { + const char *OffloadTableSizeVarName = "__omp_offloading_entries_table_size"; + void *OffloadTableSizeVarAddr = + getVarDeviceAddr(OffloadTableSizeVarName, sizeof(int64_t)); + + if (!OffloadTableSizeVarAddr) { + DP("Warning: cannot get device value for global variable '%s'.\n", + OffloadTableSizeVarName); + return false; + } + + int64_t TableSizeVal = 0; + CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL, + DeviceInfo->Queues[DeviceId], + CL_TRUE, &TableSizeVal, OffloadTableSizeVarAddr, + sizeof(int64_t), 0, nullptr, nullptr); + size_t TableSize = (size_t)TableSizeVal; + + if ((TableSize % sizeof(DeviceOffloadEntryTy)) != 0) { + DP("Warning: offload table size (%zu) is not a multiple of %zu.\n", + TableSize, sizeof(DeviceOffloadEntryTy)); + return false; + } + + size_t DeviceNumEntries = TableSize / sizeof(DeviceOffloadEntryTy); + + if (NumEntries != DeviceNumEntries) { + DP("Warning: number of entries in host and device " + "offload tables mismatch (%zu != %zu).\n", NumEntries, DeviceNumEntries); + } + + const char *OffloadTableVarName = "__omp_offloading_entries_table"; + void *OffloadTableVarAddr = getVarDeviceAddr(OffloadTableVarName, TableSize); + if (!OffloadTableVarAddr) { + DP("Warning: cannot get device value for global variable '%s'.\n", + OffloadTableVarName); + return false; + } + + OffloadEntries.resize(DeviceNumEntries); + CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL, + DeviceInfo->Queues[DeviceId], + CL_TRUE, OffloadEntries.data(), OffloadTableVarAddr, + TableSize, 0, nullptr, nullptr); + + size_t I = 0; + const char *PreviousName = ""; + bool PreviousIsVar = false; + + for (; I < DeviceNumEntries; ++I) { + DeviceOffloadEntryTy &Entry = OffloadEntries[I]; + size_t NameSize = Entry.NameSize; + void *NameTgtAddr = Entry.Base.name; + Entry.Base.name = nullptr; + + if (NameSize == 0) { + DP("Warning: offload entry (%zu) with 0 name size.\n", I); + break; + } + if (NameTgtAddr == nullptr) { + DP("Warning: offload entry (%zu) with invalid name.\n", I); + break; + } + + Entry.Base.name = new char[NameSize]; + CALL_CL_EXT_RET(DeviceId, false, clEnqueueMemcpyINTEL, + DeviceInfo->Queues[DeviceId], + CL_TRUE, Entry.Base.name, NameTgtAddr, NameSize, 0, nullptr, + nullptr); + if (strnlen(Entry.Base.name, NameSize) != NameSize - 1) { + DP("Warning: offload entry's name has wrong size.\n"); + break; + } + + int Cmp = strncmp(PreviousName, Entry.Base.name, NameSize); + if (Cmp > 0) { + DP("Warning: offload table is not sorted.\n" + "Warning: previous name is '%s'.\n" + "Warning: current name is '%s'.\n", + PreviousName, Entry.Base.name); + break; + } else if (Cmp == 0 && (PreviousIsVar || Entry.Base.size)) { + // The names are equal. This should never happen for + // offload variables, but we allow this for offload functions. + DP("Warning: duplicate names (%s) in offload table.\n", PreviousName); + break; + } + PreviousName = Entry.Base.name; + PreviousIsVar = (Entry.Base.size != 0); + } + + if (I != DeviceNumEntries) { + // Errors during the table processing. + // Deallocate all memory allocated in the loop. 
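+    // The entry at index I may or may not have had its name allocated before
+    // the loop exited, so include it here and rely on the null check below.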
+ for (size_t J = 0; J <= I; ++J) { + DeviceOffloadEntryTy &Entry = OffloadEntries[J]; + if (Entry.Base.name) + delete[] Entry.Base.name; + } + + OffloadEntries.clear(); + return false; + } + + if (DebugLevel > 0) { + DP("Device offload table loaded:\n"); + for (size_t I = 0; I < DeviceNumEntries; ++I) + DP("\t%zu:\t%s\n", I, OffloadEntries[I].Base.name); + } + + return true; +} + +void *OpenCLProgramTy::getOffloadVarDeviceAddr(const char *Name, size_t Size) { + DP("Looking up OpenMP global variable '%s' of size %zu bytes on device %d.\n", + Name, Size, DeviceId); + + if (!OffloadEntries.empty()) { + size_t NameSize = strlen(Name) + 1; + auto I = std::lower_bound( + OffloadEntries.begin(), OffloadEntries.end(), Name, + [NameSize](const DeviceOffloadEntryTy &E, const char *Name) { + return strncmp(E.Base.name, Name, NameSize) < 0; + }); + + if (I != OffloadEntries.end() && + strncmp(I->Base.name, Name, NameSize) == 0) { + DP("Global variable '%s' found in the offload table at position %zu.\n", + Name, std::distance(OffloadEntries.begin(), I)); + return I->Base.addr; + } + + DP("Warning: global variable '%s' was not found in the offload table.\n", + Name); + } else + DP("Warning: offload table is not loaded for device %d.\n", DeviceId); + + // Fallback to the lookup by name. + return getVarDeviceAddr(Name, Size); +} + +void *OpenCLProgramTy::getVarDeviceAddr(const char *Name, size_t *SizePtr) { + size_t DeviceSize = 0; + void *TgtAddr = nullptr; + size_t Size = *SizePtr; + bool SizeIsKnown = (Size != 0); + if (SizeIsKnown) { + DP("Looking up device global variable '%s' of size %zu bytes " + "on device %d.\n", Name, Size, DeviceId); + } else { + DP("Looking up device global variable '%s' of unknown size " + "on device %d.\n", Name, DeviceId); + } + + if (!DeviceInfo->isExtensionFunctionEnabled( + DeviceId, clGetDeviceGlobalVariablePointerINTELId)) + return nullptr; + + cl_int RC; + auto clGetDeviceGlobalVariablePointerINTELFn = + reinterpret_cast( + DeviceInfo->getExtensionFunctionPtr( + DeviceId, clGetDeviceGlobalVariablePointerINTELId)); + RC = clGetDeviceGlobalVariablePointerINTELFn( + Device, FinalProgram, Name, &DeviceSize, &TgtAddr); + + if (RC != CL_SUCCESS) { + DP("Warning: clGetDeviceGlobalVariablePointerINTEL API returned " + "nullptr for global variable '%s'.\n", Name); + DeviceSize = 0; + } else if (Size != DeviceSize && SizeIsKnown) { + DP("Warning: size mismatch for host (%zu) and device (%zu) versions " + "of global variable: %s\n. 
Direct references " + "to this variable will not work properly.\n", + Size, DeviceSize, Name); + DeviceSize = 0; + } + + if (DeviceSize == 0) { + DP("Warning: global variable lookup failed.\n"); + return nullptr; + } + + DP("Global variable lookup succeeded (size: %zu bytes).\n", DeviceSize); + *SizePtr = DeviceSize; + return TgtAddr; +} + +void *OpenCLProgramTy::getVarDeviceAddr(const char *Name, size_t Size) { + return getVarDeviceAddr(Name, &Size); +} + +const KernelInfoTy *OpenCLProgramTy::getKernelInfo( + const cl_kernel Kernel) const { + auto I = KernelInfo.find(Kernel); + if (I != KernelInfo.end()) + return &(I->second); + else + return nullptr; +} + +/// +/// Common plugin interface +/// + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { + uint64_t MajorVer, MinorVer; + if (isValidOneOmpImage(Image, MajorVer, MinorVer)) { + DP("Target binary is a valid oneAPI OpenMP image.\n"); + return 1; + } + + DP("Target binary is *not* a valid oneAPI OpenMP image.\n"); + + // Fallback to legacy behavior, when the image is a plain + // SPIR-V file. + uint32_t MagicWord = *(uint32_t *)Image->ImageStart; + // compare magic word in little endian and big endian: + int32_t Ret = (MagicWord == 0x07230203 || MagicWord == 0x03022307); + DP("Target binary is %s\n", Ret ? "VALID" : "INVALID"); + + return Ret; +} + +int32_t __tgt_rtl_number_of_devices() { + // Assume it is thread safe, since it is called once. + + DP("Start initializing OpenCL\n"); + // get available platforms + cl_uint Count = 0; + CALL_CL_RET_ZERO(clGetPlatformIDs, 0, nullptr, &Count); + std::vector PlatformIDs(Count); + CALL_CL_RET_ZERO(clGetPlatformIDs, Count, PlatformIDs.data(), nullptr); + + // All eligible OpenCL device IDs from the platforms are stored in a list + // in the order they are probed by clGetPlatformIDs/clGetDeviceIDs. + for (cl_platform_id ID : PlatformIDs) { + std::vector Buf; + size_t BufSize; + cl_int RC; + CALL_CL(RC, clGetPlatformInfo, ID, CL_PLATFORM_VERSION, 0, nullptr, + &BufSize); + if (RC != CL_SUCCESS || BufSize == 0) + continue; + Buf.resize(BufSize); + CALL_CL(RC, clGetPlatformInfo, ID, CL_PLATFORM_VERSION, BufSize, + Buf.data(), nullptr); + // clCreateProgramWithIL() requires OpenCL 2.1. + if (RC != CL_SUCCESS || std::stof(std::string(Buf.data() + 6)) <= 2.0) { + continue; + } + cl_uint NumDevices = 0; + CALL_CL_SILENT(RC, clGetDeviceIDs, ID, DeviceInfo->Option.DeviceType, 0, + nullptr, &NumDevices); + if (RC != CL_SUCCESS || NumDevices == 0) + continue; + + DP("Platform %s has %" PRIu32 " Devices\n", + Buf.data() ? 
Buf.data() : "undefined", NumDevices); + std::vector Devices(NumDevices); + CALL_CL_RET_ZERO(clGetDeviceIDs, ID, DeviceInfo->Option.DeviceType, + NumDevices, Devices.data(), nullptr); + + cl_context Context = nullptr; + if (DeviceInfo->Option.Flags.UseSingleContext) { + cl_context_properties ContextProperties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)ID, 0 + }; + CALL_CL_RVRC(Context, clCreateContext, RC, ContextProperties, + Devices.size(), Devices.data(), nullptr, nullptr); + if (RC != CL_SUCCESS) + continue; + } + + DeviceInfo->PlatformInfos.emplace(ID, PlatformInfoTy(ID, Context)); + for (auto Device : Devices) { + DeviceInfo->Devices.push_back(Device); + DeviceInfo->Platforms.push_back(ID); + } + DeviceInfo->NumDevices += NumDevices; + } + + if (!DeviceInfo->Option.Flags.UseSingleContext) + DeviceInfo->Contexts.resize(DeviceInfo->NumDevices); + DeviceInfo->Programs.resize(DeviceInfo->NumDevices); + DeviceInfo->maxExecutionUnits.resize(DeviceInfo->NumDevices); + DeviceInfo->maxWorkGroupSize.resize(DeviceInfo->NumDevices); + DeviceInfo->MaxMemAllocSize.resize(DeviceInfo->NumDevices); + DeviceInfo->DeviceProperties.resize(DeviceInfo->NumDevices); + DeviceInfo->Extensions.resize(DeviceInfo->NumDevices); + DeviceInfo->Queues.resize(DeviceInfo->NumDevices); + DeviceInfo->QueuesInOrder.resize(DeviceInfo->NumDevices, nullptr); + DeviceInfo->KernelProperties.resize(DeviceInfo->NumDevices); + DeviceInfo->ClMemBuffers.resize(DeviceInfo->NumDevices); + DeviceInfo->ImplicitArgs.resize(DeviceInfo->NumDevices); + DeviceInfo->Profiles.resize(DeviceInfo->NumDevices); + DeviceInfo->Names.resize(DeviceInfo->NumDevices); + DeviceInfo->DeviceArchs.resize(DeviceInfo->NumDevices); + DeviceInfo->Initialized.resize(DeviceInfo->NumDevices); + DeviceInfo->SLMSize.resize(DeviceInfo->NumDevices); + DeviceInfo->Mutexes = new std::mutex[DeviceInfo->NumDevices]; + DeviceInfo->ProfileLocks = new std::mutex[DeviceInfo->NumDevices]; + DeviceInfo->OwnedMemory.resize(DeviceInfo->NumDevices); + DeviceInfo->NumActiveKernels.resize(DeviceInfo->NumDevices, 0); + + // Host allocation information needs one additional slot + for (uint32_t I = 0; I < DeviceInfo->NumDevices + 1; I++) + DeviceInfo->MemAllocInfo.emplace_back(new MemAllocInfoMapTy()); + + // get device specific information + for (unsigned I = 0; I < DeviceInfo->NumDevices; I++) { + size_t BufSize; + cl_int RC; + cl_device_id DeviceId = DeviceInfo->Devices[I]; + CALL_CL(RC, clGetDeviceInfo, DeviceId, CL_DEVICE_NAME, 0, nullptr, + &BufSize); + if (RC != CL_SUCCESS || BufSize == 0) + continue; + DeviceInfo->Names[I].resize(BufSize); + CALL_CL(RC, clGetDeviceInfo, DeviceId, CL_DEVICE_NAME, BufSize, + DeviceInfo->Names[I].data(), nullptr); + if (RC != CL_SUCCESS) + continue; + DP("Device %d: %s\n", I, DeviceInfo->Names[I].data()); + CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_MAX_COMPUTE_UNITS, 4, + &DeviceInfo->maxExecutionUnits[I], nullptr); + DP("Number of execution units on the device is %d\n", + DeviceInfo->maxExecutionUnits[I]); + CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &DeviceInfo->maxWorkGroupSize[I], nullptr); + DP("Maximum work group size for the device is %d\n", + static_cast(DeviceInfo->maxWorkGroupSize[I])); + CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(cl_ulong), &DeviceInfo->MaxMemAllocSize[I], + nullptr); + DP("Maximum memory allocation size is %" PRIu64 "\n", + DeviceInfo->MaxMemAllocSize[I]); + CALL_CL_RET_ZERO(clGetDeviceInfo, DeviceId, 
CL_DEVICE_LOCAL_MEM_SIZE, + sizeof(cl_ulong), &DeviceInfo->SLMSize[I], nullptr); + DP("Device local mem size: %zu\n", (size_t)DeviceInfo->SLMSize[I]); + DeviceInfo->Initialized[I] = false; + } + if (DeviceInfo->NumDevices == 0) { + DP("WARNING: No OpenCL devices found.\n"); + } + +#ifndef _WIN32 + // Make sure it is registered after OCL handlers are registered. + // Registerization is done in DLLmain for Windows + if (std::atexit(closeRTL)) { + FATAL_ERROR("Registration of clean-up function"); + } +#endif //WIN32 + + return DeviceInfo->NumDevices; +} + +int32_t __tgt_rtl_init_device(int32_t DeviceId) { + cl_int RC; + DP("Initialize OpenCL device\n"); + assert(DeviceId >= 0 && (cl_uint)DeviceId < DeviceInfo->NumDevices && + "bad device id"); + + // Use out-of-order queue by default. + std::vector QProperties { + CL_QUEUE_PROPERTIES, + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE + }; + if (DeviceInfo->Option.Flags.EnableProfile) + QProperties.back() |= CL_QUEUE_PROFILING_ENABLE; + QProperties.push_back(0); + + if (!DeviceInfo->Option.Flags.UseSingleContext) { + auto Platform = DeviceInfo->Platforms[DeviceId]; + auto Device = DeviceInfo->Devices[DeviceId]; + cl_context_properties ContextProperties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)Platform, 0 + }; + CALL_CL_RVRC(DeviceInfo->Contexts[DeviceId], clCreateContext, RC, + ContextProperties, 1, &Device, nullptr, nullptr); + if (RC != CL_SUCCESS) + return OFFLOAD_FAIL; + } + + auto CLDeviceId = DeviceInfo->Devices[DeviceId]; + auto Context = DeviceInfo->getContext(DeviceId); + CALL_CL_RVRC(DeviceInfo->Queues[DeviceId], clCreateCommandQueueWithProperties, + RC, Context, CLDeviceId, QProperties.data()); + if (RC != CL_SUCCESS) { + DP("Error: Failed to create CommandQueue: %d\n", RC); + return OFFLOAD_FAIL; + } + + auto &Extension = DeviceInfo->Extensions[DeviceId]; + Extension.getExtensionsInfoForDevice(DeviceId); + + if (Extension.DeviceAttributeQuery == ExtensionStatusEnabled) { + if (OFFLOAD_SUCCESS != + DeviceInfo->DeviceProperties[DeviceId].getDeviceProperties(CLDeviceId)) + return OFFLOAD_FAIL; + } + + DeviceInfo->DeviceArchs[DeviceId] = DeviceInfo->getDeviceArch(DeviceId); + + DeviceInfo->Initialized[DeviceId] = true; + + return OFFLOAD_SUCCESS; +} + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Initialize requires flags to %" PRId64 "\n", RequiresFlags); + DeviceInfo->RequiresFlags = RequiresFlags; + return RequiresFlags; +} + +__tgt_target_table *__tgt_rtl_load_binary( + int32_t DeviceId, __tgt_device_image *Image) { + DP("Device %" PRId32 ": Loading binary from " DPxMOD "\n", DeviceId, + DPxPTR(Image->ImageStart)); + + size_t ImageSize = (size_t)Image->ImageEnd - (size_t)Image->ImageStart; + size_t NumEntries = (size_t)(Image->EntriesEnd - Image->EntriesBegin); + (void)NumEntries; + + DP("Expecting to have %zu entries defined\n", NumEntries); + + auto &Option = DeviceInfo->Option; + std::string CompilationOptions(Option.CompilationOptions + " " + + Option.UserCompilationOptions); + std::string LinkingOptions(Option.UserLinkingOptions); + + DP("Base OpenCL compilation options: %s\n", CompilationOptions.c_str()); + DP("Base OpenCL linking options: %s\n", LinkingOptions.c_str()); + + dumpImageToFile(Image->ImageStart, ImageSize, "OpenMP"); + + auto Context = DeviceInfo->getContext(DeviceId); + auto Device = DeviceInfo->Devices[DeviceId]; + DeviceInfo->Programs[DeviceId].emplace_back(Image, Context, Device, DeviceId); + auto &Program = DeviceInfo->Programs[DeviceId].back(); + + ProfileIntervalTy 
CompilationTimer("Compiling", DeviceId); + ProfileIntervalTy LinkingTimer("Linking", DeviceId); + + CompilationTimer.start(); + int32_t RC = Program.buildPrograms(CompilationOptions, LinkingOptions); + if (RC != OFFLOAD_SUCCESS) + return nullptr; + + // Decide final compilation/linking options + DP("Final OpenCL compilation options: %s\n", CompilationOptions.c_str()); + DP("Final OpenCL linking options: %s\n", LinkingOptions.c_str()); + // clLinkProgram drops the last symbol. Work this around temporarily. + LinkingOptions += " "; + + RC = Program.compilePrograms(CompilationOptions, LinkingOptions); + CompilationTimer.stop(); + if (RC != OFFLOAD_SUCCESS) + return nullptr; + + LinkingTimer.start(); + RC = Program.linkPrograms(LinkingOptions); + LinkingTimer.stop(); + if (RC != OFFLOAD_SUCCESS) + return nullptr; + + RC = Program.buildKernels(); + if (RC != OFFLOAD_SUCCESS) + return nullptr; + + auto *Table = Program.getTablePtr(); + + return Table; +} + +void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HstPtr, + int32_t Kind) { + bool ImplicitArg = false; + + if (!HstPtr) { + ImplicitArg = true; + // User allocation + if (Kind != TARGET_ALLOC_DEFAULT) { + // Explicit allocation + return dataAllocExplicit(DeviceId, Size, Kind); + } + if (DeviceInfo->Option.Flags.UseBuffer) { + // Experimental CL buffer allocation + return DeviceInfo->allocDataClMem(DeviceId, Size); + } + } + return dataAlloc(DeviceId, Size, HstPtr, HstPtr, ImplicitArg); +} + +int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size) { + return submitData(DeviceId, TgtPtr, HstPtr, Size); +} + +int32_t __tgt_rtl_data_submit_async( + int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo /*not used*/) { + return submitData(DeviceId, TgtPtr, HstPtr, Size); +} + +int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size) { + return retrieveData(DeviceId, HstPtr, TgtPtr, Size); +} + +int32_t __tgt_rtl_data_retrieve_async( + int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo /*not used*/) { + return retrieveData(DeviceId, HstPtr, TgtPtr, Size); +} + +int32_t __tgt_rtl_is_data_exchangable(int32_t SrcId, int32_t DstId) { + // Only support this case. We don't have any documented OpenCL behavior for + // cross-device data transfer. + if (SrcId == DstId) + return 1; + + return 0; +} + +int32_t __tgt_rtl_data_exchange(int32_t SrcId, void *SrcPtr, int32_t DstId, + void *DstPtr, int64_t Size) { + if (SrcId != DstId) + return OFFLOAD_FAIL; + + // This is OK for same-device copy with SVM or USM extension. 
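+  // (Assumes the submit path ultimately issues a USM/SVM memcpy, which
+  // accepts any pair of pointers that are valid in the device's context.)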
+ return __tgt_rtl_data_submit(DstId, DstPtr, SrcPtr, Size); +} + +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) { + DeviceInfo->Mutexes[DeviceId].lock(); + + // Deallocate cl_mem data + if (DeviceInfo->Option.Flags.UseBuffer) { + auto &ClMemBuffers = DeviceInfo->ClMemBuffers[DeviceId]; + if (ClMemBuffers.count(TgtPtr) > 0) { + ClMemBuffers.erase(TgtPtr); + CALL_CL_RET_FAIL(clReleaseMemObject, (cl_mem)TgtPtr); + return OFFLOAD_SUCCESS; + } + } + + DeviceInfo->Mutexes[DeviceId].unlock(); + + MemAllocInfoTy Info; + auto &AllocInfos = DeviceInfo->MemAllocInfo; + auto Removed = AllocInfos[DeviceId]->remove(TgtPtr, &Info); + // Try again with device-independent allocation information (host USM) + if (!Removed && DeviceInfo->Option.Flags.UseSingleContext) + Removed = AllocInfos[DeviceInfo->NumDevices]->remove(TgtPtr, &Info); + if (!Removed) { + DP("Error: Cannot find memory allocation information for " DPxMOD "\n", + DPxPTR(TgtPtr)); + return OFFLOAD_FAIL; + } + + auto Context = DeviceInfo->getContext(DeviceId); + if (DeviceInfo->Option.Flags.UseSVM) { + CALL_CL_VOID(clSVMFree, Context, Info.Base); + } else { + CALL_CL_EXT_VOID(DeviceId, clMemFreeINTEL, Context, Info.Base); + } + + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount /*not used*/) { + return runTargetTeamNDRegion(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, + NumArgs, NumTeams, ThreadLimit, nullptr); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount /*not used*/, + __tgt_async_info *AsyncInfo /*not used*/) { + return runTargetTeamNDRegion(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, + NumArgs, NumTeams, ThreadLimit, nullptr); +} + +int32_t __tgt_rtl_run_target_region( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs) { + // use one team! + return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, + TgtOffsets, NumArgs, 1, 0, 0); +} + +int32_t __tgt_rtl_run_target_region_async( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, __tgt_async_info *AsyncInfo /*not used*/) { + // use one team! + return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs, + TgtOffsets, NumArgs, 1, 0, 0); +} + +int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *AsyncInfo) { + return OFFLOAD_SUCCESS; +} + +/// +/// Extended plugin interface +/// + +// Notify the kernel about target pointers that are not explicitly +// passed as arguments, but which are pointing to mapped objects +// that may potentially be accessed in the kernel code (e.g. PTR_AND_OBJ +// objects). 
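+// A hypothetical call (pointer values are illustrative only):
+//   void *Ptrs[] = {MappedObj1, MappedObj2};
+//   __tgt_rtl_manifest_data_for_region(DeviceId, EntryAddr, Ptrs, 2);
+// where EntryAddr is the offload table entry address that holds the
+// cl_kernel, as passed to the launch entry points.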
+int32_t __tgt_rtl_manifest_data_for_region( + int32_t DeviceId, void *TgtEntryPtr, void **TgtPtrs, size_t NumPtrs) { + cl_kernel Kernel = *static_cast(TgtEntryPtr); + DP("Stashing %zu implicit arguments for kernel " DPxMOD "\n", NumPtrs, + DPxPTR(Kernel)); + auto &KernelProperty = DeviceInfo->KernelProperties[DeviceId][Kernel]; + std::lock_guard Lock(DeviceInfo->Mutexes[DeviceId]); + KernelProperty.ImplicitArgs.clear(); + KernelProperty.ImplicitArgs.insert(TgtPtrs, TgtPtrs + NumPtrs); + + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_requires_mapping(int32_t DeviceId, void *Ptr, int64_t Size) { + // Force mapping for host memory with positive size + int32_t Ret; + cl_unified_shared_memory_type_intel MemType = 0; + CALL_CL_EXT_RET(DeviceId, false, clGetMemAllocInfoINTEL, + DeviceInfo->getContext(DeviceId), Ptr, + CL_MEM_ALLOC_TYPE_INTEL, sizeof(MemType), &MemType, + nullptr); + if (MemType == CL_MEM_TYPE_UNKNOWN_INTEL || + (MemType == CL_MEM_TYPE_HOST_INTEL && Size > 0)) + Ret = 1; + else + Ret = 0; + + DP("Ptr " DPxMOD " %s mapping.\n", DPxPTR(Ptr), + Ret ? "requires" : "does not require"); + return Ret; +} + +// Allocate a base buffer with the given information. +void *__tgt_rtl_data_alloc_base(int32_t DeviceId, int64_t Size, void *HstPtr, + void *HstBase) { + return dataAlloc(DeviceId, Size, HstPtr, HstBase, false); +} + +// Allocate a managed memory object. +void *__tgt_rtl_data_alloc_managed(int32_t DeviceId, int64_t Size) { + int32_t Kind = DeviceInfo->Option.Flags.UseHostMemForUSM + ? TARGET_ALLOC_HOST : TARGET_ALLOC_SHARED; + return dataAllocExplicit(DeviceId, Size, Kind); +} + +void *__tgt_rtl_data_realloc( + int32_t DeviceId, void *Ptr, size_t Size, int32_t Kind) { + const MemAllocInfoTy *Info = nullptr; + + if (Ptr) { + Info = DeviceInfo->MemAllocInfo[DeviceId]->find(Ptr); + if (!Info && DeviceInfo->Option.Flags.UseSingleContext) + Info = DeviceInfo->MemAllocInfo[DeviceInfo->NumDevices]->find(Ptr); + if (!Info) { + DP("Error: Cannot find allocation information for pointer " DPxMOD "\n", + DPxPTR(Ptr)); + return nullptr; + } + if (Size <= Info->Size && Kind == Info->Kind) { + DP("Returning the same pointer " DPxMOD " as reallocation is unneeded\n", + DPxPTR(Ptr)); + return Ptr; + } + } + + int32_t AllocKind = + (Kind == TARGET_ALLOC_DEFAULT) ? TARGET_ALLOC_DEVICE : Kind; + + void *Mem = dataAllocExplicit(DeviceId, Size, AllocKind); + + if (Mem && Info) { + if (AllocKind == TARGET_ALLOC_DEVICE || Info->Kind == TARGET_ALLOC_DEVICE || + Info->Kind == TARGET_ALLOC_SVM) { + // TARGET_ALLOC_SVM is for "Device" memory type when SVM is enabled + auto Queue = DeviceInfo->Queues[DeviceId]; + if (DeviceInfo->Option.Flags.UseSVM) { + CALL_CL_RET_NULL(clEnqueueSVMMemcpy, Queue, CL_TRUE, Mem, Ptr, + Info->Size, 0, nullptr, nullptr); + } else { + CALL_CL_EXT_RET_NULL(DeviceId, clEnqueueMemcpyINTEL, Queue, CL_TRUE, + Mem, Ptr, Info->Size, 0, nullptr, nullptr); + } + } else { + std::copy_n((char *)Ptr, Info->Size, (char *)Mem); + } + auto Rc = __tgt_rtl_data_delete(DeviceId, Ptr); + if (Rc != OFFLOAD_SUCCESS) + return nullptr; + } + + return Mem; +} + +void *__tgt_rtl_get_context_handle(int32_t DeviceId) { + return (void *)DeviceInfo->getContext(DeviceId); +} + +void *__tgt_rtl_data_aligned_alloc(int32_t DeviceId, size_t Align, size_t Size, + int32_t Kind) { + if (Align != 0 && (Align & (Align - 1)) != 0) { + DP("Error: Alignment %zu is not power of two.\n", Align); + return nullptr; + } + + int32_t AllocKind = + (Kind == TARGET_ALLOC_DEFAULT) ? 
TARGET_ALLOC_DEVICE : Kind; + + return dataAllocExplicit(DeviceId, Size, AllocKind, Align); +} + +int32_t __tgt_rtl_run_target_team_nd_region( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, void *LoopDesc) { + return runTargetTeamNDRegion(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, + NumArgs, NumTeams, ThreadLimit, LoopDesc); +} + +char *__tgt_rtl_get_device_name(int32_t DeviceId, char *Buf, size_t BufMax) { + assert(Buf && "Buf cannot be nullptr."); + assert(BufMax > 0 && "BufMax cannot be zero."); + CALL_CL_RET_NULL(clGetDeviceInfo, DeviceInfo->Devices[DeviceId], + CL_DEVICE_NAME, BufMax, Buf, nullptr); + return Buf; +} + +void __tgt_rtl_add_build_options( + const char *CompileOptions, const char *LinkOptions) { + if (CompileOptions) { + auto &compileOptions = DeviceInfo->Option.UserCompilationOptions; + if (compileOptions.empty()) { + compileOptions = std::string(CompileOptions) + " "; + } else { + DP("Respecting LIBOMPTARGET_OPENCL_COMPILATION_OPTIONS=%s\n", + compileOptions.c_str()); + } + } + if (LinkOptions) { + auto &linkOptions = DeviceInfo->Option.UserLinkingOptions; + if (linkOptions.empty()) { + linkOptions = std::string(LinkOptions) + " "; + } else { + DP("Respecting LIBOMPTARGET_OPENCL_LINKING_OPTIONS=%s\n", + linkOptions.c_str()); + } + } +} + +int32_t __tgt_rtl_is_supported_device(int32_t DeviceId, void *DeviceType) { + if (!DeviceType) + return true; + + uint64_t DeviceArch = DeviceInfo->DeviceArchs[DeviceId]; + int32_t Ret = (uint64_t)(DeviceArch & (uint64_t)DeviceType) == DeviceArch; + DP("Device %" PRIu32 " does%s match the requested device types " DPxMOD "\n", + DeviceId, Ret ? "" : " not", DPxPTR(DeviceType)); + return Ret; +} + +void __tgt_rtl_deinit(void) { + // No-op on Linux +#ifdef _WIN32 + if (DeviceInfo) { + closeRTL(); + deinit(); + } +#endif // _WIN32 +} + +int32_t __tgt_rtl_is_accessible_addr_range( + int32_t DeviceId, const void *Ptr, size_t Size) { + if (!Ptr || Size == 0) + return 0; + + auto MemType = DeviceInfo->getMemAllocType(DeviceId, Ptr); + if (MemType != CL_MEM_TYPE_HOST_INTEL && MemType != CL_MEM_TYPE_SHARED_INTEL) + return 0; + + if (MemType == CL_MEM_TYPE_HOST_INTEL && + DeviceInfo->Option.Flags.UseSingleContext) + DeviceId = DeviceInfo->NumDevices; + + if (DeviceInfo->MemAllocInfo[DeviceId]->contains(Ptr, Size)) + return 1; + else + return 0; +} + +int32_t __tgt_rtl_is_private_arg_on_host( + int32_t DeviceId, const void *TgtEntryPtr, uint32_t Idx) { + const cl_kernel *Kernel = static_cast(TgtEntryPtr); + if (!*Kernel) { + REPORT("Querying information about a deleted kernel.\n"); + return 0; + } + auto *KernelInfo = DeviceInfo->getKernelInfo(DeviceId, *Kernel); + if (!KernelInfo) + return 0; + + if (KernelInfo->isArgLiteral(Idx)) + return 1; + + return 0; +} + +void *__tgt_rtl_alloc_per_hw_thread_scratch( + int32_t DeviceId, size_t ObjSize, int32_t AllocKind) { + void *Mem = nullptr; + cl_uint NumHWThreads = DeviceInfo->DeviceProperties[DeviceId].NumHWThreads; + + if (NumHWThreads == 0) + return Mem; + + // Only support USM + cl_int RC; + auto Context = DeviceInfo->getContext(DeviceId); + auto Device = DeviceInfo->Devices[DeviceId]; + size_t AllocSize = ObjSize * NumHWThreads; + auto AllocProp = DeviceInfo->getAllocMemProperties(DeviceId, AllocSize); + + switch (AllocKind) { + case TARGET_ALLOC_HOST: + CALL_CL_EXT_RVRC(DeviceId, Mem, clHostMemAllocINTEL, RC, Context, + AllocProp->data(), AllocSize, 0 /* Align */); + break; + case TARGET_ALLOC_SHARED: + 
CALL_CL_EXT_RVRC(DeviceId, Mem, clSharedMemAllocINTEL, RC, Context, Device, + AllocProp->data(), AllocSize, 0 /* Align */); + break; + case TARGET_ALLOC_DEVICE: + default: + CALL_CL_EXT_RVRC(DeviceId, Mem, clDeviceMemAllocINTEL, RC, Context, Device, + AllocProp->data(), AllocSize, 0 /* Align */); + } + + if (RC != CL_SUCCESS) { + DP("Failed to allocate per-hw-thread scratch space.\n"); + return nullptr; + } + + DP("Allocated %zu byte per-hw-thread scratch space at " DPxMOD "\n", + AllocSize, DPxPTR(Mem)); + + return Mem; +} + +void __tgt_rtl_free_per_hw_thread_scratch(int32_t DeviceId, void *Ptr) { + auto Context = DeviceInfo->getContext(DeviceId); + CALL_CL_EXT_VOID(DeviceId, clMemFreeINTEL, Context, Ptr); +}
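+
+// Illustrative call sequence (a minimal sketch of how libomptarget is
+// expected to drive this plugin; error handling is omitted, and the image,
+// sizes and entry index below are hypothetical):
+//
+//   int32_t NumDevices = __tgt_rtl_number_of_devices();
+//   __tgt_rtl_init_device(/*DeviceId=*/0);
+//   __tgt_target_table *Table = __tgt_rtl_load_binary(0, &Image);
+//   void *TgtPtr = __tgt_rtl_data_alloc(0, Size, HstPtr, TARGET_ALLOC_DEFAULT);
+//   __tgt_rtl_data_submit(0, TgtPtr, HstPtr, Size);
+//   ptrdiff_t Offset = 0;
+//   __tgt_rtl_run_target_region(0, Table->EntriesBegin[K].addr, &TgtPtr,
+//                               &Offset, /*NumArgs=*/1);
+//   __tgt_rtl_data_retrieve(0, HstPtr, TgtPtr, Size);
+//   __tgt_rtl_data_delete(0, TgtPtr);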