Index: llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -56,24 +56,24 @@ struct GV { /// The size reserved for data in a shared memory slot. - const unsigned GV_Slot_Size; + unsigned GV_Slot_Size; /// The default value of maximum number of threads in a worker warp. - const unsigned GV_Warp_Size; + unsigned GV_Warp_Size; constexpr unsigned warpSlotSize() const { return GV_Warp_Size * GV_Slot_Size; } /// the maximum number of teams. - const unsigned GV_Max_Teams; + unsigned GV_Max_Teams; // An alternative to the heavy data sharing infrastructure that uses global // memory is one that uses device __shared__ memory. The amount of such space // (in bytes) reserved by the OpenMP runtime is noted here. - const unsigned GV_SimpleBufferSize; + unsigned GV_SimpleBufferSize; // The absolute maximum team size for a working group - const unsigned GV_Max_WG_Size; + unsigned GV_Max_WG_Size; // The default maximum team size for a working group - const unsigned GV_Default_WG_Size; + unsigned GV_Default_WG_Size; constexpr unsigned maxWarpNumber() const { return GV_Max_WG_Size / GV_Warp_Size; Index: openmp/libomptarget/CMakeLists.txt =================================================================== --- openmp/libomptarget/CMakeLists.txt +++ openmp/libomptarget/CMakeLists.txt @@ -85,6 +85,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) +add_subdirectory(plugins-nextgen) add_subdirectory(DeviceRTL) add_subdirectory(tools) Index: openmp/libomptarget/include/Utilities.h =================================================================== --- /dev/null +++ openmp/libomptarget/include/Utilities.h @@ -0,0 +1,297 @@ +//===------- Utilities.h - Target independent OpenMP target RTL -- C++ ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Routines and classes used to provide useful functionalities like string +// parsing and environment variables. +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H +#define OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H + +#include "llvm/ADT/STLFunctionalExtras.h" + +#include "Debug.h" + +#include +#include +#include +#include +#include +#include +#include + + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class implementing error code checking and function return values. +/// Do not use StatusCode_ directly, use StatusCode instead, which is defined +/// below. +template +class StatusCode_ { + enum CheckedKind { + DONE = 0, + PENDING = 1, + IGNORE = 2 + }; + + int32_t Code; + mutable CheckedKind Checked; + + explicit StatusCode_(int32_t CodeValue, CheckedKind CheckedValue) : + Code(CodeValue), Checked(CheckedValue) {} + + void setChecked() const { + if (Checked != IGNORE) + Checked = DONE; + } + +public: + /// Pre-defined status codes useful for returning and comparing. + static const StatusCode_ SUCCESS; + static const StatusCode_ FAIL; + + /// Create an empty status code. Can be destroyed without checking. + StatusCode_() : + Code(0), Checked(DONE) {} + + /// Create a status code with a specific value. 
Must be checked before + /// being destroyed. + explicit StatusCode_(int32_t CodeValue) + : Code(CodeValue), Checked(PENDING) {} + + /// Create a status code from another. Inherit the code only; the code has to + /// be checked independently of the other status code object. + StatusCode_(const StatusCode_ &OtherSC) + : Code(OtherSC.Code), Checked(PENDING) {} + + /// Create a status code moving it from another. Inherit the code only; the + /// code has to be checked independently of the other status code object. The + /// other status code object does not need to be checked. + StatusCode_(StatusCode_ &&OtherSC) + : Code(OtherSC.Code), Checked(PENDING) { + OtherSC.Checked = DONE; + } + + ~StatusCode_() { + assert(Checked != PENDING && "Unchecked status code"); + } + + /// Copy operator working similar as the copy constructor. + StatusCode_ &operator=(const StatusCode_ &OtherSC) { + if (this != &OtherSC) { + assert(Checked != PENDING && "Overwriting unchecked status code"); + + Code = OtherSC.Code; + Checked = (OtherSC.Checked != IGNORE) ? OtherSC.Checked : PENDING; + } + return *this; + } + + /// Move operator working similar as the move constructor. + StatusCode_ &operator=(StatusCode_ &&OtherSC) { + if (this != &OtherSC) { + assert(Checked != PENDING && "Overwriting unchecked status code"); + assert(OtherSC.Checked != IGNORE && "Predefined status cannot be moved"); + + Code = OtherSC.Code; + Checked = OtherSC.Checked; + + OtherSC.Checked = DONE; + } + return *this; + } + + /// Get the status code and set as checked. + int32_t getCode() const { + setChecked(); + return Code; + } + + /// Indicate whether the status code is successful and set as checked. + bool succeed() const { + return getCode() == 0; + } + + /// Indicate whether the status code is failed and set as checked. + bool failed() const { + return !succeed(); + } + + /// Indicate whether the status code is failed and set as checked. + operator bool() const { + return getCode(); + } + + /// Compare the status code with another object. + bool operator ==(const StatusCode_ &OtherSC) const { + setChecked(); + return (bool)Code == (bool)OtherSC.Code; + } +}; + +template +const StatusCode_ StatusCode_::SUCCESS(0, StatusCode_::IGNORE); + +template +const StatusCode_ StatusCode_::FAIL(1, StatusCode_::IGNORE); + +/// Always use StatusCode instead of StatusCode_. This trick allows using static +/// inline class members before C++17. Static inline class members are cleanly +/// supported by C++17 and beyond. +using StatusCode = StatusCode_; + +/// Utility class for parsing strings to other types. +struct StringParser { + /// Parse a string to another type. + template + static bool parse(const char *Value, Ty &Result); + + // Convert bool to "Yes" / "No" strings. + static constexpr const char *boolToYesNoString(bool Value) { + return Value ? "Yes" : "No"; + } +}; + +/// Class for reading and checking environment variables. Currently working with +/// integer, floats, std::string and bool types. +template class Envar { + std::string Name; + Ty Data; + bool IsPresent; + bool Initialized; + +public: + /// Create an empty envar. Cannot be consulted. This constructor is merely + /// for convenience. + Envar() : Name(), Data(Ty()), IsPresent(false), Initialized(false) {} + + /// Create an envar with a name and an optional default. The Envar object will + /// take the value read from the environment variable, or the default if it + /// was not set or not correct. 
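+  ///
+  /// A minimal usage sketch (the envar name below is only an illustrative
+  /// assumption, not one defined here):
+  ///
+  /// \code
+  ///   Int32Envar Example("LIBOMPTARGET_EXAMPLE_ENVAR", /*Default=*/32);
+  ///   int32_t Value = Example.get(); // 32 unless the envar overrides it.
+  /// \endcode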
+ Envar(const char *EnvarName, Ty Default = Ty()) + : Name(EnvarName), Data(Default), IsPresent(false), Initialized(true) { + if (const char *EnvStr = getenv(EnvarName)) { + // Check whether the envar is defined and valid. + IsPresent = StringParser::parse(EnvStr, Data); + + if (!IsPresent) { + DP("Ignoring invalid value %s for envar %s\n", EnvStr, EnvarName); + Data = Default; + } + } + } + + /// Create an envar with a name, getter function and a setter function. The + /// Envar object will take the value read from the environment variable if + /// this value is accepted by the setter function. Otherwise, the getter + /// function will be executed to get the default value. The getter should be + /// of the form StatusCode GetterFunctionTy(Ty &Value) and the setter should + /// be of the form StatusCode SetterFunctionTy(Ty Value). + template + Envar(const char *EnvarName, GetterFunctor Getter, SetterFunctor Setter) + : Name(EnvarName), Data(Ty()), IsPresent(false), Initialized(true) { + init(Getter, Setter); + } + + /// Get the definitive value. + const Ty &get() const { + // Throw a runtime error in case this envar is not initialized. + if (!Initialized) + FATAL_MESSAGE0(1, "Consulting envar %s before initialization", Name.data()); + + return Data; + } + + /// Get the definitive value. + operator Ty() const { return get(); } + + /// Indicate whether the environment variable was defined and valid. + bool isPresent() const { return IsPresent; } + +private: + template + void init(GetterFunctor Getter, SetterFunctor Setter); +}; + +/// Define some common envar types. +using IntEnvar = Envar; +using Int32Envar = Envar; +using Int64Envar = Envar; +using UInt32Envar = Envar; +using UInt64Envar = Envar; +using StringEnvar = Envar; +using BoolEnvar = Envar; + +template <> +inline bool StringParser::parse(const char *ValueStr, bool &Result) +{ + std::string Value(ValueStr); + + // Convert the string to lowercase. + std::transform(Value.begin(), Value.end(), Value.begin(), + [](unsigned char c){ return std::tolower(c); }); + + // May be implemented with fancier C++ features, but let's keep it simple. + if (Value == "true" || Value == "yes" || Value == "on" || Value == "1") + Result = true; + else if (Value == "false" || Value == "no" || Value == "off" || Value == "0") + Result = false; + else + return false; + + // Parsed correctly. + return true; +} + +template +inline bool StringParser::parse(const char *Value, Ty &Result) +{ + assert(Value && "Parsed value cannot be null"); + + std::istringstream Stream(Value); + Stream >> Result; + + return !Stream.fail(); +} + +template +template +inline void Envar::init(GetterFunctor Getter, SetterFunctor Setter) { + // Get the default value (current). + Ty Default; + if (Getter(Default)) + FATAL_MESSAGE0(1, "Getter for envar %s failed", Name.data()); + + if (const char *EnvStr = getenv(Name.data())) { + IsPresent = StringParser::parse(EnvStr, Data); + if (IsPresent) { + // Check whether the envar value is actually valid. + IsPresent = !((bool) Setter(Data)); + if (!IsPresent) { + // Setter reported value as invalid, reset to the getter value. 
+ Data = Default; + DP("Setter of envar %s failed, resetting to %s\n", + Name.data(), std::to_string(Data).data()); + } + } else { + DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data()); + Data = Default; + } + } else { + Data = Default; + } +} + +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H Index: openmp/libomptarget/include/rtl.h =================================================================== --- openmp/libomptarget/include/rtl.h +++ openmp/libomptarget/include/rtl.h @@ -169,6 +169,9 @@ // (i.e. the library attempts to load the RTLs (plugins) only once). std::once_flag InitFlag; void loadRTLs(); // not thread-safe + +private: + static bool attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL); }; /// Map between the host entry begin and the translation table. Each Index: openmp/libomptarget/plugins-nextgen/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/CMakeLists.txt @@ -0,0 +1,19 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +add_subdirectory(common) +add_subdirectory(cuda) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + Index: openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt @@ -0,0 +1,13 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +add_subdirectory(PluginInterface) Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt @@ -0,0 +1,32 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +# Plugin Interface library. +add_library(PluginInterface OBJECT PluginInterface.cpp GlobalHandler.cpp) + +# Define the TARGET_NAME. +add_definitions("-DTARGET_NAME=PluginInterface") + +# Define the DEBUG_PREFIX. +add_definitions(-DDEBUG_PREFIX="PluginInterface") + +set_property(TARGET PluginInterface PROPERTY POSITION_INDEPENDENT_CODE ON) +llvm_update_compile_flags(PluginInterface) +set(LINK_LLVM_LIBS LLVMSupport) +if (LLVM_LINK_LLVM_DYLIB) + set(LINK_LLVM_LIBS LLVM) +endif() +target_link_libraries(PluginInterface INTERFACE ${LINK_LLVM_LIBS} PRIVATE elf_common MemoryManager) +add_dependencies(PluginInterface ${LINK_LLVM_LIBS}) + +target_include_directories(PluginInterface INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(PluginInterface PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h @@ -0,0 +1,197 @@ +//===- GlobalHandler.h - Target independent global & enviroment handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H + +#include +#include + +#include "llvm/Object/ELFObjectFile.h" + +#include "Debug.h" +#include "omptarget.h" +#include "Utilities.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +struct DeviceImageTy; +struct GenericDeviceTy; + +using namespace llvm::object; + +/// Common abstraction for globals that live on the host and device. +/// It simply encapsulates the symbol name, symbol size, and symbol address +/// (which might be host or device depending on the context). 
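+///
+/// As an illustration only (the symbol name below is an assumption made for
+/// the example, not something defined by this interface):
+///
+/// \code
+///   GlobalTy DebugKind("__omp_rtl_debug_kind", sizeof(uint32_t));
+/// \endcode
+///
+/// describes a 4-byte global whose address has not been resolved yet.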
+class GlobalTy { + // TODO: Maybe we can have a pointer to the offload entry name instead of + // holding a private copy of the name as a std::string + std::string Name; + uint32_t Size; + void *Ptr; + +public: + GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr) + : Name(Name), Size(Size), Ptr(Ptr) {} + + GlobalTy(const __tgt_offload_entry &Entry) + : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {} + + const std::string &getName() const { return Name; } + uint32_t getSize() const { return Size; } + void *getPtr() const { return Ptr; } + + void setSize(int32_t S) { Size = S; } + void setPtr(void *P) { Ptr = P; } +}; + +/// Subclass of GlobalTy that holds the memory for a global of \p Ty. +template class StaticGlobalTy : public GlobalTy { + Ty Data; + +public: + template + StaticGlobalTy(const std::string &Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + template + StaticGlobalTy(const char *Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + template + StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args) + : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + + Ty &getValue() { return Data; } + const Ty &getValue() const { return Data; } + void setValue(const Ty &V) { Data = V; } +}; + +/// Subclass of GlobalTy that holds the memory which may exceed the global type +/// \p Ty. +template class DynamicGlobalTy : public GlobalTy { +public: + DynamicGlobalTy(const std::string &Name, uint32_t Size) + : GlobalTy(Name, Size, malloc(Size)) {} + DynamicGlobalTy(const char *Name, const char *Suffix, uint32_t Size) + : GlobalTy(std::string(Name) + Suffix, Size, malloc(Size)) {} + ~DynamicGlobalTy() { free(getPtr()); } + + Ty &getValue() { return *static_cast(getPtr()); } + const Ty &getValue() const { return *static_cast(getPtr()); } + void setValue(const Ty &V) { *getPtr() = V; } +}; + +/// Helper class to do the heavy lifting when it comes to moving globals between +/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD, +/// which means the only things specialized by the subclass is the retrival of +/// global metadata (size, addr) from the device. +/// \see getGlobalMetadataFromDevice +class GenericGlobalHandlerTy { + /// Map to store the ELF object files that have been loaded + std::unordered_map ELFObjectFiles; + std::mutex ELFObjectFilesMutex; + + /// Get the cached ELF64LEObjectFile previosuly created for a specific + /// device image or create it if did not exist. + const ELF64LEObjectFile *getOrCreateELFObjectFile( + const GenericDeviceTy &Device, + DeviceImageTy &Image); + + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + StatusCode moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + bool Device2Host); + + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + StatusCode moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal, + bool Device2Host); + +public: + virtual ~GenericGlobalHandlerTy() {} + + /// Get the address and size of a global in the image. Address and size are + /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal. 
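+  ///
+  /// A rough usage sketch (names are placeholders for the example):
+  ///
+  /// \code
+  ///   GlobalTy ImageGlobal("some_symbol", /*Size=*/0);
+  ///   if (getGlobalMetadataFromImage(Device, Image, ImageGlobal).succeed())
+  ///     use(ImageGlobal.getPtr(), ImageGlobal.getSize());
+  /// \endcode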
+ StatusCode getGlobalMetadataFromImage(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &ImageGlobal); + + /// Read the memory associated with a global from the image and store it on + /// the host. The name, size, and destination are defined by \p HostGlobal. + StatusCode readGlobalFromImage(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal); + + /// Get the address and size of a global from the device. Address is return in + /// \p DeviceGlobal, the global name and expected size are passed in + /// \p DeviceGlobal. + virtual StatusCode getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) = 0; + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. The origin is defined by \p DeviceGlobal. + StatusCode readGlobalFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + /* D2H */ true); + } + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. The origin is automatically resolved. + StatusCode readGlobalFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /* D2H */ true); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is defined by \p DeviceGlobal. + StatusCode writeGlobalToDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + /* D2H */ false); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is automatically resolved. + StatusCode writeGlobalToDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, /* D2H */ false); + } +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_GLOBALHANDLER_GLOBALHANDLER_H Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp @@ -0,0 +1,196 @@ +//===- GlobalHandler.cpp - Target independent global & env. var handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. 
+// +//===----------------------------------------------------------------------===// + +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include + +using namespace llvm; +using namespace omp; +using namespace target; +using namespace plugin; + +const ELF64LEObjectFile *GenericGlobalHandlerTy::getOrCreateELFObjectFile( + const GenericDeviceTy &Device, + DeviceImageTy &Image) { + + std::lock_guard Lock(ELFObjectFilesMutex); + auto Search = ELFObjectFiles.find(Image.getId()); + if (Search == ELFObjectFiles.end()) { + Expected ExpectedELF = + ELF64LEObjectFile::create(Image.getMemoryBuffer()); + if (!ExpectedELF) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Unable to open ELF image."); + return nullptr; + } + + ELFObjectFiles.emplace(Image.getId(), std::move(ExpectedELF.get())); + + Search = ELFObjectFiles.find(Image.getId()); + assert(Search != ELFObjectFiles.end()); + } + return &Search->second; +} + + +StatusCode GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( + GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + bool Device2Host) { + + GlobalTy DeviceGlobal(HostGlobal.getName(), HostGlobal.getSize()); + StatusCode SC = getGlobalMetadataFromDevice(Device, Image, DeviceGlobal); + if (SC) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Failed to read global symbol metadata for '%s' from the device", + HostGlobal.getName().c_str()); + return SC; + } + + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + Device2Host); +} + +/// Actually move memory between host and device. See readGlobalFromDevice and +/// writeGlobalToDevice for the interface description. +StatusCode GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( + GenericDeviceTy &Device, + DeviceImageTy &DeviceImage, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal, + bool Device2Host) { + + StatusCode SC; + if (Device2Host) + SC = Device.dataRetrieve(HostGlobal.getPtr(), DeviceGlobal.getPtr(), + HostGlobal.getSize(), nullptr); + else + SC = Device.dataSubmit(DeviceGlobal.getPtr(), HostGlobal.getPtr(), + HostGlobal.getSize(), nullptr); + + if (SC) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Failed to %s %u bytes associated with global symbol '%s' %s " + "the device", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().c_str(), Device2Host ? "from" : "to"); + return SC; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Successfully %s %u bytes associated with global symbol '%s' %s " + "the device (%p -> %p)", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().c_str(), Device2Host ? "from" : "to", + DeviceGlobal.getPtr(), HostGlobal.getPtr()); + + return SC; +} + +StatusCode GenericGlobalHandlerTy::getGlobalMetadataFromImage( + GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &ImageGlobal) { + + // Get the ELF object file for the image. Notice the ELF object may already + // be created in previous calls, so we can reuse it. + const ELF64LEObjectFile *ELF = getOrCreateELFObjectFile(Device, Image); + if (!ELF) + return StatusCode::FAIL; + + // Then extract the base address of elf image. + Expected StartAddr = ELF->getStartAddress(); + if (!StartAddr) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Unable to determine ELF start address."); + return StatusCode::FAIL; + } + + // TODO: We should improve the search by consulting HASH or GNU_HASH tables. 
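+  // For now, linearly scan the symbol table; once the requested name matches,
+  // the host-visible address of the symbol inside the mapped image is
+  // computed from the image start, the section offset and the symbol value
+  // (see below).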
+ for (auto &SymbolIt : ELF->symbols()) { + // First check the name, continue if we don't match. + Expected Name = SymbolIt.getName(); + if (!Name || !Name.get().equals(ImageGlobal.getName())) + continue; + + // If we match we will either succeed or fail with retriving the content, + // either way, the loop is done. First step is to verify the size. + ImageGlobal.setSize(SymbolIt.getSize()); + + // Then extract the relative offset from the elf image base. + Expected Offset = SymbolIt.getValue(); + if (!Offset) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image but address could " + "not be determined.", ImageGlobal.getName().c_str()); + return StatusCode::FAIL; + } + + Expected SectionIt = SymbolIt.getSection(); + if (!SectionIt) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image but section could " + "not be determined.", ImageGlobal.getName().c_str()); + return StatusCode::FAIL; + } + ELFSectionRef ELFSection = *(SectionIt.get()); + + Expected SectionName = ELFSection.getName(); + if (!SectionName) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image but the section name " + "could not be determined.", ImageGlobal.getName().c_str()); + return StatusCode::FAIL; + } + + ImageGlobal.setPtr((char *)Image.getStart() + ELFSection.getOffset() + + Offset.get()); + + return StatusCode::SUCCESS; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was not found in the elf image.", + ImageGlobal.getName().c_str()); + return StatusCode::FAIL; +} + +StatusCode GenericGlobalHandlerTy::readGlobalFromImage( + GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + + GlobalTy ImageGlobal(HostGlobal.getName(), -1); + StatusCode SC = getGlobalMetadataFromImage(Device, Image, ImageGlobal); + if (SC) + return SC; + + if (ImageGlobal.getSize() != HostGlobal.getSize()) { + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' has %u bytes in the elf image but %u bytes " + "on the host, abort transfer.", + HostGlobal.getName().c_str(), ImageGlobal.getSize(), + HostGlobal.getSize()); + return StatusCode::FAIL; + } + + INFO(OMP_INFOTYPE_DATA_TRANSFER, Device.getDeviceId(), + "Global symbol '%s' was found in the elf image and %u bytes will " + "copied from %p to %p.", + HostGlobal.getName().c_str(), HostGlobal.getSize(), ImageGlobal.getPtr(), + HostGlobal.getPtr()); + std::memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize()); + return StatusCode::SUCCESS; +} Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -0,0 +1,820 @@ +//===- PluginInterface.h - Target independent plugin device interface -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "MemoryManager.h" +#include "Utilities.h" +#include "omptarget.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBufferRef.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +struct GenericPluginTy; +struct GenericKernelTy; +struct GenericDeviceTy; + +/// Class that wraps the __tgt_async_info to simply its usage. In case the +/// object is constructed without a valid __tgt_async_info, the object will use +/// an internal one and will synchronize the current thread with the pending +/// operations on object destruction. +struct AsyncInfoWrapperTy { + AsyncInfoWrapperTy(StatusCode &SC, GenericDeviceTy &Device, + __tgt_async_info *AsyncInfoPtr) + : SC(SC), Device(Device), + AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {} + + /// Synchronize with the __tgt_async_info's pending operations if it's the + /// internal one. + ~AsyncInfoWrapperTy(); + + /// Get the raw __tgt_async_info pointer. + operator __tgt_async_info *() const { return AsyncInfoPtr; } + + /// Get a reference to the underlying plugin-specific queue type. + template Ty &getQueueAs() const { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type!"); + return reinterpret_cast(AsyncInfoPtr->Queue); + } + +private: + StatusCode &SC; + GenericDeviceTy &Device; + __tgt_async_info LocalAsyncInfo; + __tgt_async_info *const AsyncInfoPtr; +}; + +/// Class wrapping a __tgt_device_image and its offload entry table on a +/// specific device. This class is responsible for storing and managing +/// the offload entries for an image on a device. +class DeviceImageTy { + + /// Class representing the offload entry table. The class stores the + /// __tgt_target_table and a map to search in the table faster. + struct OffloadEntryTableTy { + /// Clear the table of entries. + void clear() { + Entries.clear(); + TTTablePtr.EntriesBegin = TTTablePtr.EntriesEnd = nullptr; + } + + /// Add new entry in the table. + void addEntry(const __tgt_offload_entry &Entry) { + Entries.push_back(Entry); + TTTablePtr.EntriesBegin = &Entries[0]; + TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size(); + } + + /// Get the offload entry by its address. + __tgt_offload_entry *getEntry(void *Addr) { + auto It = EntryMap.find(Addr); + if (It == EntryMap.end()) + return nullptr; + return &Entries[It->second]; + } + + /// Get the raw pointer to the __tgt_target_table. + operator __tgt_target_table *() { + if (Entries.empty()) + return nullptr; + return &TTTablePtr; + } + + private: + __tgt_target_table TTTablePtr; + std::map EntryMap; + std::vector<__tgt_offload_entry> Entries; + }; + + /// Image identifier within the corresponding device. Notice that this id is + /// not unique between different device; they may overlap. 
+ int32_t ImageId; + + /// The pointer to the raw __tgt_device_image. + const __tgt_device_image *TgtImage; + + /// Table of offload entries. + OffloadEntryTableTy OffloadEntryTable; + +public: + DeviceImageTy(int32_t Id, const __tgt_device_image *Image) + : ImageId(Id), TgtImage(Image) { + assert(TgtImage && "Target image is invalid"); + } + + /// Get the image identifier within the device. + int32_t getId() const { return ImageId; } + + /// Get the pointer to the raw __tgt_device_image. + const __tgt_device_image *getTgtImage() const { return TgtImage; } + + /// Get the image starting address. + void *getStart() const { return TgtImage->ImageStart; } + + /// Get the image size. + size_t getSize() const { + return ((char *)TgtImage->ImageEnd) - ((char *)TgtImage->ImageStart); + } + + /// Get a memory buffer reference to the whole image. + MemoryBufferRef getMemoryBuffer() const { + return MemoryBufferRef( + StringRef((const char *)getStart(), getSize()), "Image"); + } + + /// Get a reference to the offload entry table for the image. + OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; } +}; + +/// Class implementing common functionalities of offload kernels. Each plugin +/// should define the specific kernel class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericKernelTy { + /// Construct a kernel with a name and a execution mode. + GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode) + : Name(Name), ExecutionMode(ExecutionMode) {} + + virtual ~GenericKernelTy() {} + + /// Initialize the kernel object from a specific device. + StatusCode init(GenericDeviceTy &GenericDevice); + virtual StatusCode initImpl(GenericDeviceTy &GenericDevice) = 0; + + /// Launch the kernel on the specific device. The device must be the same + /// one used to initialize the kernel. + StatusCode launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + int32_t NumTeamsClause, int32_t ThreadLimitClause, + int32_t LoopTripCount, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + virtual StatusCode launchImpl(GenericDeviceTy &GenericDevice, + int32_t NumThreads, int32_t NumBlocks, + int32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; +private: + /// Prepare the arguments before launching the kernel. + void *prepareArgs(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + + /// Get the default number of threads and blocks for the kernel. + virtual int32_t getDefaultNumThreads(GenericDeviceTy &Device) const = 0; + virtual int32_t getDefaultNumBlocks(GenericDeviceTy &Device) const = 0; + + /// Get the number of threads and blocks for the kernel based on the + /// user-defined threads and block clauses. + int32_t getNumThreads(GenericDeviceTy &GenericDevice, + int32_t ThreadLimitClause) const; + int32_t getNumBlocks(GenericDeviceTy &GenericDevice, int32_t BlockLimitClause, + int32_t LoopTripCount, int32_t NumThreads) const; + + /// Get the kernel name. + const char *getName() const { return Name; } + + /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. 
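+  /// Exactly one of the following predicates holds, depending on the
+  /// execution mode flag the kernel was constructed with.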
+ bool isGenericSPMDMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; + } + bool isGenericMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC; + } + bool isSPMDMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; + } + + /// Get the execution mode name of the kernel. + const char *getExecutionModeName() const { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + return "SPMD"; + case OMP_TGT_EXEC_MODE_GENERIC: + return "Generic"; + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return "Generic-SPMD"; + } + llvm_unreachable("Unknown execution mode!"); + } + + /// The kernel name. + const char *Name; + + /// The execution flags of the kernel. + OMPTgtExecModeFlags ExecutionMode; + +protected: + /// The dynamic memory size reserved for executing the kernel. + int32_t DynamicMemorySize = -1; + + /// The preferred number of threads to run the kernel. + int32_t PreferredNumThreads = -1; + + /// The maximum number of threads which the kernel could leverage. + int32_t MaxNumThreads = -1; +}; + +/// Class implementing common functionalities of offload devices. Each plugin +/// should define the specific device class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericDeviceTy : public DeviceAllocatorTy { + /// Construct a device with its device id within the plugin, the number of + /// devices in the plugin and the grid values for that kind of device. + GenericDeviceTy(int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &GridValues); + + /// Get the device identifier within the corresponding plugin. Notice that + /// this id is not unique between different plugins; they may overlap. + int32_t getDeviceId() const { return DeviceId; } + + /// Set the context of the device if needed, before calling device-specific + /// functions. Plugins may implement this function as a no-op if not needed. + virtual StatusCode setContext() = 0; + + /// Initialize the device. After this call, the device should be already + /// working and ready to accept queries or modifications. + StatusCode init(GenericPluginTy &Plugin); + virtual StatusCode initImpl(GenericPluginTy &Plugin) = 0; + + /// Deinitialize the device and free all its resources. After this call, the + /// device is no longer considered ready, so no queries or modifications are + /// allowed. + StatusCode deinit(); + virtual StatusCode deinitImpl() = 0; + + /// Load the binary image into the device and return the target table. + __tgt_target_table *loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage); + virtual DeviceImageTy *loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) = 0; + + /// Setup the device environment if needed. Notice this setup may not be run + /// on some plugins. By default, it will be executed, but plugins can change + /// this behavior by overriding the shouldSetupDeviceEnvironment function. + StatusCode setupDeviceEnvironment(GenericPluginTy &Plugin, + DeviceImageTy &Image); + + /// Register the offload entries for a specific image on the device. + StatusCode registerOffloadEntries(DeviceImageTy &Image); + + /// Synchronize the current thread with the pending operations on the + /// __tgt_async_info structure. + StatusCode synchronize(__tgt_async_info *AsyncInfo); + virtual StatusCode synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Allocate data on the device or involving the device. 
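+  /// For illustration, a caller could request plain device memory with:
+  ///
+  /// \code
+  ///   void *TgtPtr = Device.dataAlloc(Size, /*HostPtr=*/nullptr,
+  ///                                   TARGET_ALLOC_DEVICE);
+  /// \endcode
+  ///
+  /// where TARGET_ALLOC_DEVICE is one of the TargetAllocTy kinds declared in
+  /// omptarget.h.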
+ void *dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); + + /// Deallocate data from the device or involving the device. + StatusCode dataDelete(void *TgtPtr, TargetAllocTy Kind); + + /// Submit data to the device (host to device transfer). + StatusCode dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual StatusCode dataSubmitImpl(void *TgtPtr, const void *HstPtr, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Retrieve data from the device (device to host transfer). + StatusCode dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual StatusCode dataRetrieveImpl(void *HstPtr, const void *TgtPtr, + int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Exchange data between devices (device to device transfer). Calling this function + /// is only valid if GenericPlugin::isDataExchangable() passing the two devices + /// returns true. + StatusCode dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual StatusCode dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Run the target region with multiple teams. + StatusCode runTargetTeamRegion(void *EntryPtr, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + int32_t NumTeamsClause, + int32_t ThreadLimitClause, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo); + + /// Initialize a __tgt_async_info structure. Related to interop features. + StatusCode initAsyncInfo(__tgt_async_info **AsyncInfoPtr); + virtual StatusCode initAsyncInfoImpl( + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Initialize a __tgt_device_info structure. Related to interop features. + StatusCode initDeviceInfo(__tgt_device_info *DeviceInfo, const char **Err); + virtual StatusCode initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + + /// Create an event. + StatusCode createEvent(void **EventPtrStorage); + virtual StatusCode createEventImpl(void **EventPtrStorage) = 0; + + /// Destroy an event. + StatusCode destroyEvent(void *Event); + virtual StatusCode destroyEventImpl(void *EventPtr) = 0; + + /// Start the recording of the event. + StatusCode recordEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual StatusCode recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Wait for an event to finish. Notice this wait is asynchronous if the + /// __tgt_async_info is not nullptr. + StatusCode waitEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual StatusCode waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Synchronize the current thread with the event. + StatusCode syncEvent(void *EventPtr); + virtual StatusCode syncEventImpl(void *EventPtr) = 0; + + /// Print information about the device. + void printInfo(); + virtual void printInfoImpl() = 0; + + /// Getters of the grid values. + int32_t getWarpSize() const { return GridValues.GV_Warp_Size; } + int32_t getBlockLimit() const { return GridValues.GV_Max_Teams; } + int32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; } + int32_t getDefaultNumThreads() const { + return GridValues.GV_Default_WG_Size; + } + int32_t getDefaultNumBlocks() const { + // TODO: Introduce a default num blocks value. 
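+    // Until then, reuse the default workgroup size value as a placeholder.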
+ return GridValues.GV_Default_WG_Size; + } + int32_t getDynamicMemorySize() const { + return OMPX_SharedMemorySize; + } + +private: + /// Register offload entry for global variable. + StatusCode registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, + const __tgt_offload_entry &GlobalEntry, + __tgt_offload_entry &DeviceEntry); + + /// Register offload entry for kernel function. + StatusCode registerKernelOffloadEntry(DeviceImageTy &DeviceImage, + const __tgt_offload_entry &KernelEntry, + __tgt_offload_entry &DeviceEntry); + + /// Allocate and construct a kernel object. + virtual GenericKernelTy * + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) = 0; + + /// Get and set the stack size and heap size for the device. If not used, the + /// plugin can implement the setters as no-op and setting the output + /// value to zero for the getters. + virtual StatusCode getDeviceStackSize(uint64_t &V) = 0; + virtual StatusCode setDeviceStackSize(uint64_t V) = 0; + virtual StatusCode getDeviceHeapSize(uint64_t &V) = 0; + virtual StatusCode setDeviceHeapSize(uint64_t V) = 0; + + /// Environment variables defined by the OpenMP standard. + Int32Envar OMP_TeamLimit; + Int32Envar OMP_NumTeams; + Int32Envar OMP_TeamsThreadLimit; + + /// Environment variables defined by the LLVM OpenMP implementation. + Int32Envar OMPX_DebugKind; + Int32Envar OMPX_SharedMemorySize; + UInt64Envar OMPX_TargetStackSize; + UInt64Envar OMPX_TargetHeapSize; + + /// Pointer to the memory manager or nullptr if not available. + MemoryManagerTy *MemoryManager; + +protected: + /// Array of images loaded into the device. Images are automatically + /// deallocated by the allocator. + std::vector LoadedImages; + + /// The identifier of the device within the plugin. Notice this is not a + /// global device id and is not the device id visible to the OpenMP user. + const int32_t DeviceId; + + /// The default grid values used for this device. + llvm::omp::GV GridValues; + + /// Enumeration used for representing the current state between two devices + /// two devices (both under the same plugin) for the peer access between them. + /// The states can be a) PENDING when the state has not been queried and needs + /// to be queried, b) AVAILABLE when the peer access is available to be used, + /// and c) UNAVAILABLE if the system does not allow it. + enum class PeerAccessState : uint8_t { + AVAILABLE, + UNAVAILABLE, + PENDING + }; + + /// Array of peer access states with the rest of devices. This means that if + /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I can access device J's memory directly. However, notice this + /// does not mean that device J can access device I's memory directly. + std::vector PeerAccesses; + std::mutex PeerAccessesLock; +}; + +/// Class implementing common functionalities of offload plugins. Each plugin +/// should define the specific plugin class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericPluginTy { + + /// Construct a plugin instance. The number of active instances should be + /// always be either zero or one. + GenericPluginTy() { + ++NumActiveInstances; + } + + /// Destroy the plugin instance and release all its resources. Also decrease + /// the number of instances. + virtual ~GenericPluginTy() { + // There is no global handler if no device is available. + if (GlobalHandler) + delete GlobalHandler; + + // Deinitialize all active devices. 
+ for (int32_t Id = 0; Id < NumDevices; ++Id) { + if (Devices[Id]) { + StatusCode SC = deinitDevice(Id); + if (SC) + REPORT("Error when deinitializing device %d\n", Id); + } + assert(!Devices[Id] && "Device was not deinitialized"); + } + + --NumActiveInstances; + } + + /// Get the reference to the device with a certain device id. + GenericDeviceTy &getDevice(int32_t DeviceId) { + assert(isValidDeviceId(DeviceId) && "Device Id is invalid"); + assert(Devices[DeviceId] && "Device is unitialized"); + + return *Devices[DeviceId]; + } + + /// Get the number of active devices. + int32_t getNumDevices() const { return NumDevices; } + + /// Get the ELF code to recognize the binary image of this plugin. + virtual uint16_t getMagicElfBits() const = 0; + + /// Allocate a structure using the internal allocator. + template Ty *allocate() { + return reinterpret_cast(Allocator.Allocate(sizeof(Ty), alignof(Ty))); + } + + /// Get the reference to the global handler of this plugin. + GenericGlobalHandlerTy &getGlobalHandler() { + assert(GlobalHandler && "Global handler not initialized"); + return *GlobalHandler; + } + + /// Get the OpenMP requires flags set for this plugin. + int64_t getRequiresFlags() const { return RequiresFlags; } + + /// Set the OpenMP requires flags for this plugin. + void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; } + + /// Initialize a device within the plugin. + StatusCode initDevice(int32_t DeviceId); + + /// Deinitialize a device within the plugin and release its resources. + StatusCode deinitDevice(int32_t DeviceId); + + /// Indicate whether data can be exchanged directly between two devices under + /// this same plugin. If this function returns true, it's safe to call the + /// GenericDeviceTy::exchangeData() function on the source device. + virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId); + } + + /// Indicate if an image is compatible with a specific device. Notice that + /// this function may be called before actually initializing the device. So + /// we could not move this function into GenericDeviceTy. + virtual bool isImageCompatible(int32_t DeviceId, + __tgt_image_info *Info) const = 0; + + /// Indicate whether the plugin supports empty images. + virtual bool supportsEmptyImages() const { return false; } + + /// Indicate whether there is any active plugin instance. + static bool hasAnyActiveInstance() { + assert(NumActiveInstances <= 1); + return (NumActiveInstances > 0); + } + +protected: + /// Initialize the plugin and prepare for initializing its devices. + void init(int NumDevices, GenericGlobalHandlerTy *GlobalHandler) { + this->NumDevices = NumDevices; + this->GlobalHandler = GlobalHandler; + + assert(Devices.size() == 0); + + Devices.resize(NumDevices, nullptr); + } + + /// Create a new device with a specific device id. + virtual GenericDeviceTy &createDevice(int32_t DeviceId) = 0; + + /// Indicate whether a device id is valid. + bool isValidDeviceId(int32_t DeviceId) const { + return (DeviceId >= 0 && DeviceId < getNumDevices()); + } + +private: + /// Number of devices available for the plugin. + int32_t NumDevices; + + /// Array of pointers to the devices. Initially, they are all set to nullptr. + /// Once a device is initialized, the pointer is stored in the position given + /// by its device id. A position with nullptr means that the corresponding + /// device was not initialized yet. 
+ std::vector Devices; + + /// OpenMP requires flags + int64_t RequiresFlags = OMP_REQ_UNDEFINED; + + /// Pointer to the global handler for this plugin. + GenericGlobalHandlerTy *GlobalHandler; + + /// Internal allocator for different structures. + BumpPtrAllocator Allocator; + + /// Indicates the number of active plugin instances. Actually, we should only + /// have one active instance per plugin library. But we use a counter for + /// simplicity. + static uint32_t NumActiveInstances; +}; + +/// Class for simplifying the getter operation of the plugin. Anywhere on the +/// code, the current plugin can be retrieved by Plugin::get(). The init(), +/// deinit(), get() and getErrorStr() functions should be defined by each plugin +/// implementation. +class Plugin { + /// Avoid instances of this class. + Plugin() {} + Plugin(const Plugin &) = delete; + void operator=(const Plugin &) = delete; + +public: + /// Initialize the plugin if it was not initialized yet. + static StatusCode init(); + + /// Deinitialize the plugin if it was not deinitialized yet. + static StatusCode deinit(); + + /// Get a reference (or create if it was not created) to the plugin instance. + static GenericPluginTy &get(); + + /// Get a reference to the plugin with a specific plugin-specific type. + template + static Ty &get() { + return static_cast(get()); + } + + /// Indicate if the plugin is currently active. Actually, we check if there is + /// any active instances. + static bool isActive() { return GenericPluginTy::hasAnyActiveInstance(); } + + /// Get a string description from a status code. + static const char *getErrorStr(const StatusCode &SC); +}; + +/// Auxiliary interface class for GenericDeviceResourcePoolTy. This class acts +/// as a reference to a device resource, such as a stream, and requires some +/// basic functions to be implemented. The derived class should define an empty +/// constructor that creates an empty and invalid resource reference. Do not +/// create a new resource on the ctor, but on the create() function instead. +struct GenericDeviceResourceRef { + /// Create a new resource and stores a reference. + virtual StatusCode create() = 0; + + /// Destroy and release the resources pointed by the reference. + virtual StatusCode destroy() = 0; +}; + +/// Class that implements a resource pool belonging to a device. This class +/// operates with references to the actual resources. These reference must +/// derive from the GenericDeviceResourceRef class and implement the create +/// and destroy virtual functions. +template class GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create an empty resource pool for a specific device. + GenericDeviceResourcePoolTy(GenericDeviceTy &Device) + : Device(Device), NextAvailable(0) {} + + /// Destroy the resource pool. At this point, the deinit() function should + /// already have been executed so the resource pool should be empty. + virtual ~GenericDeviceResourcePoolTy() { assert(ResourcePool.empty()); } + + /// Initialize the resource pool. + StatusCode init(uint32_t InitialSize) { + assert(ResourcePool.empty() && "resource pool has been initialized"); + return ResourcePoolTy::resizeResourcePool(InitialSize); + } + + /// Deinitialize the resource pool and delete all resources. This function + /// must be called before the destructor. 
+ StatusCode deinit() { + StatusCode SC = ResourcePoolTy::resizeResourcePool(0); + if (SC) + return SC; + + ResourcePool.clear(); + + return StatusCode::SUCCESS; + } + +protected: + /// Get resource from the pool or create new resources. + ResourceRef getResource() { + const std::lock_guard Lock(Mutex); + if (NextAvailable == ResourcePool.size()) { + // By default we double the resource pool every time. + if (ResourcePoolTy::resizeResourcePool(NextAvailable * 2)) + return ResourceRef(); + } + return ResourcePool[NextAvailable++]; + } + + /// Return resource to the pool. + void returnResource(ResourceRef Resource) { + const std::lock_guard Lock(Mutex); + ResourcePool[--NextAvailable] = Resource; + } + +private: + /// The resources between \p OldSize and \p NewSize need to be created or + /// destroyed. The mutex is locked when this function is called. + StatusCode resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) { + assert(OldSize != NewSize && "resizing to the same size"); + + if (Device.setContext()) + return StatusCode::FAIL; + + if (OldSize < NewSize) { + // Create new resources. + for (uint32_t I = OldSize; I < NewSize; ++I) { + StatusCode SC = ResourcePool[I].create(); + if (SC) + return SC; + } + } else { + // Destroy the obsolete resources. + for (uint32_t I = NewSize; I < OldSize; ++I) { + StatusCode SC = ResourcePool[I].destroy(); + if (SC) + return SC; + } + } + return StatusCode::SUCCESS; + } + + /// Increase or decrease the number of resources. This function should + /// be called with the mutex acquired. + StatusCode resizeResourcePool(uint32_t NewSize) { + uint32_t OldSize = ResourcePool.size(); + + StatusCode SC; + if (OldSize == NewSize) { + SC = StatusCode::SUCCESS; + } else if (OldSize < NewSize) { + ResourcePool.resize(NewSize); + SC = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + } else { + SC = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + ResourcePool.resize(NewSize); + } + return SC; + } + + /// The device to which the resources belong + GenericDeviceTy &Device; + + /// Mutex for the resource pool. + std::mutex Mutex; + + /// The next available resource in the pool. + uint32_t NextAvailable; + +protected: + /// The actual resource pool. + std::deque ResourcePool; +}; + +/// Class implementing a common stream manager. This class can be directly used +/// by the specific plugins if necessary. The StreamRef type should derive from +/// the GenericDeviceResourceRef. Look at its description to know the details of +/// their requirements. +template +class GenericStreamManagerTy : public GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create a stream manager with space for an initial number of streams. No + /// stream will be created until the init() function is called. + GenericStreamManagerTy(GenericDeviceTy &Device, uint32_t DefNumStreams = 32) + : ResourcePoolTy(Device), + InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", DefNumStreams) {} + + /// Initialize the stream pool and their resources with the initial number of + /// streams. + StatusCode init() { + return ResourcePoolTy::init(InitialNumStreams.get()); + } + + /// Get an available stream or create new. + StreamRef getStream() { + return ResourcePoolTy::getResource(); + } + + /// Return idle stream. + void returnStream(StreamRef Stream) { + ResourcePoolTy::returnResource(Stream); + } + +private: + /// The initial stream pool size, potentially defined by an envar. 
+ UInt32Envar InitialNumStreams; +}; + +/// Class implementing a common event manager. This class can be directly used +/// by the specific plugins if necessary. The EventRef type should derive from +/// the GenericDeviceResourceRef. Look at its description to know the details of +/// their requirements. +template +struct GenericEventManagerTy : public GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create an event manager with space for an initial number of events. No + /// event will be created until the init() function is called. + GenericEventManagerTy(GenericDeviceTy &Device, uint32_t DefNumEvents = 32) + : ResourcePoolTy(Device), + InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", DefNumEvents) {} + + /// Initialize the event pool and their resources with the initial number of + /// events. + StatusCode init() { + return ResourcePoolTy::init(InitialNumEvents.get()); + } + + /// Get an available event or create new. + EventRef getEvent() { + return ResourcePoolTy::getResource(); + } + + /// Return an idle event. + void returnEvent(EventRef Event) { + ResourcePoolTy::returnResource(Event); + } + +private: + /// The initial event pool size, potentially defined by an envar. + UInt32Envar InitialNumEvents; +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H Index: openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -0,0 +1,821 @@ +//===- PluginInterface.cpp - Target independent plugin device interface ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "elf_common.h" +#include "PluginInterface.h" +#include "GlobalHandler.h" +#include "omptarget.h" +#include "omptargetplugin.h" + +#include +#include + +using namespace llvm; +using namespace omp; +using namespace target; +using namespace plugin; + +uint32_t GenericPluginTy::NumActiveInstances = 0; + +AsyncInfoWrapperTy::~AsyncInfoWrapperTy() { + // If we used a local async info object we want synchronous behavior. + // In that case, and assuming the current status code is OK, we will + // synchronize explicitly when the object is deleted. 
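  // For instance, when dataSubmit() is reached through the synchronous
  // __tgt_rtl_data_submit() entry point, the wrapper is built around a null
  // __tgt_async_info, falls back to LocalAsyncInfo, and this destructor makes
  // the transfer complete before dataSubmit() returns.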
+ if (AsyncInfoPtr == &LocalAsyncInfo && SC.succeed()) + SC = Device.synchronize(&LocalAsyncInfo); +} + +StatusCode GenericKernelTy::init(GenericDeviceTy &GenericDevice) { + PreferredNumThreads = getDefaultNumThreads(GenericDevice); + if (isGenericMode()) + PreferredNumThreads += GenericDevice.getWarpSize(); + + MaxNumThreads = GenericDevice.getThreadLimit(); + + DynamicMemorySize = GenericDevice.getDynamicMemorySize(); + + StatusCode SC = initImpl(GenericDevice); + if (SC) { + REPORT("Failure to initialize kernel\n"); + } + return SC; +} + +StatusCode GenericKernelTy::launch(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, int32_t NumTeamsClause, + int32_t ThreadLimitClause, + int32_t LoopTripCount, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + llvm::SmallVector Args; + llvm::SmallVector Ptrs; + + void *KernelArgsPtr = prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, + NumArgs, Args, Ptrs, AsyncInfoWrapper); + + int32_t NumThreads = getNumThreads(GenericDevice, ThreadLimitClause); + int32_t NumBlocks = getNumBlocks(GenericDevice, NumTeamsClause, LoopTripCount, + NumThreads); + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "Launching kernel %s with %d blocks and %d threads in %s mode\n", + getName(), NumBlocks, NumThreads, getExecutionModeName()); + + StatusCode SC = launchImpl(GenericDevice, NumThreads, NumBlocks, + DynamicMemorySize, NumArgs, KernelArgsPtr, + AsyncInfoWrapper); + if (SC) { + REPORT("Failure to launch kernel %s\n", getName()); + } + return SC; +} + +void *GenericKernelTy::prepareArgs(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + Args.resize(NumArgs); + Ptrs.resize(NumArgs); + + if (NumArgs == 0) + return nullptr; + + for (int I = 0; I < NumArgs; ++I) { + Ptrs[I] = (void *)((intptr_t)ArgPtrs[I] + ArgOffsets[I]); + Args[I] = &Ptrs[I]; + } + return &Args[0]; +} + +int32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, + int32_t ThreadLimitClause) const { + return std::min(MaxNumThreads, (ThreadLimitClause > 0) ? ThreadLimitClause + : PreferredNumThreads); +} + +int32_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, + int32_t NumTeamsClause, + int32_t LoopTripCount, + int32_t NumThreads) const { + int32_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice); + if (NumTeamsClause > 0) { + PreferredNumBlocks = NumTeamsClause; + } else if (LoopTripCount > 0) { + if (isSPMDMode()) { + // We have a combined construct, i.e. `target teams distribute + // parallel for [simd]`. We launch so many teams so that each thread + // will execute one iteration of the loop. round up to the nearest + // integer + PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + } else { + assert((isGenericMode() || isGenericSPMDMode()) && + "Unexpected execution mode!"); + // If we reach this point, then we have a non-combined construct, i.e. + // `teams distribute` with a nested `parallel for` and each team is + // assigned one iteration of the `distribute` loop. E.g.: + // + // #pragma omp target teams distribute + // for(...loop_tripcount...) { + // #pragma omp parallel for + // for(...) {} + // } + // + // Threads within a team will execute the iterations of the `parallel` + // loop. 
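      //
      // As a worked example: with LoopTripCount = 1000 the generic path below
      // requests 1000 blocks (one distribute iteration per team), while the
      // SPMD path above with NumThreads = 128 requests
      // ((1000 - 1) / 128) + 1 = 8 blocks; both values are still clamped to
      // the device block limit afterwards.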
+ PreferredNumBlocks = LoopTripCount; + } + } + return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit()); +} + +GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &OMPGridValues) + : OMP_TeamLimit("OMP_TEAM_LIMIT"), + OMP_NumTeams("OMP_NUM_TEAMS"), + OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"), + OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"), + OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"), + // Do not initialize the following two envars since they depend on the + // device initialization. These cannot be consulted until the device is + // initialized correctly. We intialize them in GenericDeviceTy::init(). + OMPX_TargetStackSize(), + OMPX_TargetHeapSize(), + MemoryManager(nullptr), + DeviceId(DeviceId), + GridValues(OMPGridValues), + PeerAccesses(NumDevices, PeerAccessState::PENDING), + PeerAccessesLock() { + if (OMP_NumTeams > 0) + GridValues.GV_Max_Teams = + std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams)); + + if (OMP_TeamsThreadLimit > 0) + GridValues.GV_Max_WG_Size = std::min(GridValues.GV_Max_WG_Size, + uint32_t(OMP_TeamsThreadLimit)); +}; + +StatusCode GenericDeviceTy::init(GenericPluginTy &Plugin) { + StatusCode SC = initImpl(Plugin); + if (SC) { + REPORT("Failure to initialize device\n"); + } + + // Read and reinitialize the envars that depend on the device initialization. + // Notice these two envars may change the stack size and heap size of the + // device, so they need the device properly initialized. + OMPX_TargetStackSize = UInt64Envar("LIBOMPTARGET_STACK_SIZE", + [this](uint64_t &V) -> StatusCode { return getDeviceStackSize(V); }, + [this](uint64_t V) -> StatusCode { return setDeviceStackSize(V); }); + + OMPX_TargetHeapSize = UInt64Envar("LIBOMPTARGET_HEAP_SIZE", + [this](uint64_t &V) -> StatusCode { return getDeviceHeapSize(V); }, + [this](uint64_t V) -> StatusCode { return setDeviceHeapSize(V); }); + + // Enable the memory manager if required. + bool EnableMM; + size_t ThresholdMM; + std::tie(ThresholdMM, EnableMM) = MemoryManagerTy::getSizeThresholdFromEnv(); + if (EnableMM) + MemoryManager = new MemoryManagerTy(*this, ThresholdMM); + + return SC; +} + +StatusCode GenericDeviceTy::deinit() { + // Delete the memory manager before deinitilizing the device. Otherwise, + // we may delete device allocations after the device is deinitialized. + if (MemoryManager) + delete MemoryManager; + MemoryManager = nullptr; + + StatusCode SC = deinitImpl(); + if (SC) { + REPORT("Failure to deinitialize device (%d): %s\n", DeviceId, + Plugin::getErrorStr(SC)); + } + return SC; +} + +__tgt_target_table * +GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage) { + DP("Load data from image " DPxMOD "\n", DPxPTR(TgtImage->ImageStart)); + + // Load the binary and allocate the image object. Use the next available id + // for the image id, which is the number of previously loaded images. + DeviceImageTy *Image = loadBinaryImpl(TgtImage, LoadedImages.size()); + if (!Image) { + REPORT("Failure to load binary (%p) in vendor specific part.\n", TgtImage); + return nullptr; + } + + // Add the image to list. + LoadedImages.push_back(Image); + + // Setup the device environment if needed. + StatusCode SC = setupDeviceEnvironment(Plugin, *Image); + if (SC) { + REPORT("Failure to setup device environment.\n"); + return nullptr; + } + + // Register all offload entries of the image. 
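  // Entries with a nonzero size describe globals; zero-sized entries describe
  // kernels, whose entry address is set to a pointer to the plugin's
  // GenericKernelTy object.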
+ SC = registerOffloadEntries(*Image); + if (SC) { + REPORT("Failed to register offload entries from device image.\n"); + return nullptr; + } + + // Return the pointer to the table of entries. + return Image->getOffloadEntryTable(); +} + +StatusCode GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + DeviceEnvironmentTy DeviceEnvironment; + DeviceEnvironment.DebugKind = OMPX_DebugKind; + DeviceEnvironment.NumDevices = Plugin.getNumDevices(); + // TODO: The device ID used here is not the real device ID used by OpenMP. + DeviceEnvironment.DeviceNum = DeviceId; + DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; + + GlobalTy DeviceEnvGlobal("omptarget_device_environment", + sizeof(DeviceEnvironmentTy), &DeviceEnvironment); + + GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler(); + if (GlobalHandler.writeGlobalToDevice(*this, Image, DeviceEnvGlobal)) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Failed to write device environment into image."); + // TODO: Check the device gfx name against the image gfx name. + return StatusCode::FAIL; + } + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { + const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin; + const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd; + for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) { + if (!Entry->addr) { + // The host should have always something in the address to uniquely + // identify the entry. + INFO(OMP_INFOTYPE_ALL, DeviceId, + "Unexpected host entry without address (size: %ld), abort!\n", + Entry->size); + return StatusCode::FAIL; + } + + StatusCode SC; + __tgt_offload_entry DeviceEntry = {0}; + + if (Entry->size) + SC = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry); + else + SC = registerKernelOffloadEntry(Image, *Entry, DeviceEntry); + if (SC) + return StatusCode::FAIL; + + assert(DeviceEntry.addr && "Device addr of offload entry cannot be null"); + + DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n", + DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name, + DPxPTR(DeviceEntry.addr)); + } + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::registerGlobalOffloadEntry(DeviceImageTy &Image, + const __tgt_offload_entry &GlobalEntry, __tgt_offload_entry &DeviceEntry) { + GenericPluginTy &Plugin = Plugin::get(); + GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler(); + + DeviceEntry = GlobalEntry; + + // Create a metadata object for the device global. + GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size); + + // Get the address of the device of the global. + if (GlobalHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal)) + return StatusCode::FAIL; + + // Store the device address on the device entry. + DeviceEntry.addr = DeviceGlobal.getPtr(); + assert(DeviceEntry.addr && "Device global's address cannot be null"); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. + if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If unified memory is present any target link or to variables + // can access host addresses directly. There is no longer a + // need for device copies. 
+ GlobalTy HostGlobal(GlobalEntry); + if (GlobalHandler.writeGlobalToDevice(*this, Image, HostGlobal, DeviceGlobal)) + return StatusCode::FAIL; + } + + // Add the device entry on the entry table. + Image.getOffloadEntryTable().addEntry(DeviceEntry); + + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::registerKernelOffloadEntry(DeviceImageTy &Image, + const __tgt_offload_entry &KernelEntry, __tgt_offload_entry &DeviceEntry) { + DeviceEntry = KernelEntry; + + // Create a kernel object. + GenericKernelTy *Kernel = constructKernelEntry(KernelEntry, Image); + if (!Kernel) + return StatusCode::FAIL; + + // Initialize the kernel. + StatusCode SC = Kernel->init(*this); + if (SC) + return SC; + + // Set the device entry address to the kernel address and store the entry on + // the entry table. + DeviceEntry.addr = (void *)Kernel; + Image.getOffloadEntryTable().addEntry(DeviceEntry); + + return StatusCode::SUCCESS; +} + +StatusCode GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) { + if (!AsyncInfo || !AsyncInfo->Queue) + return StatusCode::FAIL; + + StatusCode SC = synchronizeImpl(*AsyncInfo); + if (SC) { + REPORT("Error when synchronizing stream (%p): %s\n", AsyncInfo->Queue, + Plugin::getErrorStr(SC)); + } + return SC; +} + +void *GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, + TargetAllocTy Kind) { + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + if (MemoryManager) { + return MemoryManager->allocate(Size, HostPtr); + } + LLVM_FALLTHROUGH; + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + return allocate(Size, HostPtr, Kind); + } + + REPORT("Invalid target data allocation kind or requested allocator not " + "implemented yet.\n"); + + return nullptr; +} + +StatusCode GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { + StatusCode SC; + + if (MemoryManager) { + SC = StatusCode(MemoryManager->free(TgtPtr)); + } else { + SC = StatusCode(free(TgtPtr, Kind)); + } + + if (SC) { + REPORT("Failed to deallocate device pointer %p\n", TgtPtr); + } + return SC; +} + +StatusCode GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper); + if (SC) { + REPORT("Error when copying data from host to device. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + } + return SC; +} + +StatusCode GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper); + if (SC) { + REPORT("Error when copying data from device to host. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + } + return SC; +} + +StatusCode GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper); + if (SC) { + REPORT("Error when copying data from device (%d) to device (%d). 
Pointers: " + "host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n", + DeviceId, DstDev.DeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size); + } + return SC; +} + +StatusCode GenericDeviceTy::runTargetTeamRegion( + void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs, + int32_t NumTeamsClause, int32_t ThreadLimitClause, uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + + GenericKernelTy &GenericKernel = + *reinterpret_cast(EntryPtr); + + int32_t LoopTripCount32 = + (LoopTripCount > uint64_t(std::numeric_limits::max())) + ? std::numeric_limits::max() + : LoopTripCount; + + SC = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause, + ThreadLimitClause, LoopTripCount32, + AsyncInfoWrapper); + if (SC) { + REPORT("Error when running target region " DPxMOD " in device %d\n", + DPxPTR(EntryPtr), DeviceId); + } + return SC; +} + +StatusCode GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { + assert(AsyncInfoPtr && "Async info pointer is null"); + + *AsyncInfoPtr = new __tgt_async_info(); + + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, *AsyncInfoPtr); + + SC = initAsyncInfoImpl(AsyncInfoWrapper); + if (SC) { + REPORT("Error when initializing async info at " DPxMOD " on device %d\n", + DPxPTR(*AsyncInfoPtr), DeviceId); + } + return SC; +} + +StatusCode GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo, + const char **ErrStr) { + assert(DeviceInfo && "Device info is null"); + + *ErrStr = nullptr; + + StatusCode SC = initDeviceInfoImpl(DeviceInfo); + if (SC) { + REPORT("Error when initializing device info at " DPxMOD " on device %d\n", + DPxPTR(DeviceInfo), DeviceId); + } + return SC; +} + +StatusCode GenericPluginTy::initDevice(int32_t DeviceId) { + assert(!Devices[DeviceId] && "Device already initialized"); + + // Create the device and save the reference. + GenericDeviceTy &Device = createDevice(DeviceId); + Devices[DeviceId] = &Device; + + // Initialize the device and its resources. + StatusCode SC = Device.init(*this); + if (SC) { + REPORT("Failure to initialize device (%d): %s\n", DeviceId, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericPluginTy::deinitDevice(int32_t DeviceId) { + // The device may be already deinitialized. + if (Devices[DeviceId] == nullptr) + return StatusCode::SUCCESS; + + // Deinitialize the device and release its resources. + StatusCode SC = Devices[DeviceId]->deinit(); + if (SC) { + REPORT("Failure to deinitialize device (%d): %s\n", DeviceId, + Plugin::getErrorStr(SC)); + } + + // Delete the device and invalidate its reference. 
+ delete Devices[DeviceId]; + Devices[DeviceId] = nullptr; + + return StatusCode::SUCCESS; +} + +void GenericDeviceTy::printInfo() { + // TODO: Print generic information here + printInfoImpl(); +} + +StatusCode GenericDeviceTy::createEvent(void **EventPtrStorage) { + StatusCode SC = createEventImpl(EventPtrStorage); + if (SC) { + REPORT("Failure to create event: %s\n", Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::destroyEvent(void *EventPtr) { + StatusCode SC = destroyEventImpl(EventPtr); + if (SC) { + REPORT("Failure to destroy event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::recordEvent(void *EventPtr, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = recordEventImpl(EventPtr, AsyncInfoWrapper); + if (SC) { + REPORT("Failure to record event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::waitEvent(void *EventPtr, + __tgt_async_info *AsyncInfo) { + StatusCode SC; + AsyncInfoWrapperTy AsyncInfoWrapper(SC, *this, AsyncInfo); + SC = waitEventImpl(EventPtr, AsyncInfoWrapper); + if (SC) { + REPORT("Failure to wait for event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +StatusCode GenericDeviceTy::syncEvent(void *EventPtr) { + StatusCode SC = syncEventImpl(EventPtr); + if (SC) { + REPORT("Failure to sync event (%p): %s\n", EventPtr, + Plugin::getErrorStr(SC)); + } + return SC; +} + +/// Exposed library API function, basically wrappers around the GenericDeviceTy +/// functionality with the same name. All non-async functions are redirected +/// to the async versions right away with a NULL AsyncInfoPtr. +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_init_plugin() { + StatusCode SC = Plugin::init(); + if (SC) { + REPORT("Error when initializing plugin" GETNAME(TARGET_NAME) "\n"); + } + return SC; +} + +int32_t __tgt_rtl_deinit_plugin() { + StatusCode SC = Plugin::deinit(); + if (SC) { + REPORT("Error when deinitializing plugin" GETNAME(TARGET_NAME) "\n"); + } + return SC; +} + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) { + if (!Plugin::isActive()) + return false; + + return elf_check_machine(TgtImage, Plugin::get().getMagicElfBits()); +} + +int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage, + __tgt_image_info *Info) { + if (!Plugin::isActive()) + return false; + + if (!__tgt_rtl_is_valid_binary(TgtImage)) + return false; + + // A subarchitecture was not specified. Assume it is compatible. 
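  // (When present, Arch typically holds the offload architecture string the
  // image was built for, e.g. "sm_70" for CUDA or "gfx90a" for AMDGPU.)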
+ if (!Info->Arch) + return true; + + GenericPluginTy &Plugin = Plugin::get(); + + int32_t NumDevices = Plugin.getNumDevices(); + for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) { + if (!Plugin.isImageCompatible(DeviceId, Info)) + return false; + } + + DP("Image is compatible with current environment: %s\n", Info->Arch); + + return true; +} + +int32_t __tgt_rtl_supports_empty_images() { + // KNOTE: Only VE supports empty images + return Plugin::get().supportsEmptyImages(); +} + +int32_t __tgt_rtl_deinit_device(int32_t DeviceId) { + return Plugin::get().deinitDevice(DeviceId); +} + +int32_t __tgt_rtl_number_of_devices() { + return Plugin::get().getNumDevices(); +} + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + Plugin::get().setRequiresFlag(RequiresFlags); + return RequiresFlags; +} + +int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId); +} + +int32_t __tgt_rtl_init_device(int32_t DeviceId) { + return Plugin::get().initDevice(DeviceId); +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, + __tgt_device_image *TgtImage) { + GenericPluginTy &Plugin = Plugin::get(); + return Plugin.getDevice(DeviceId).loadBinary(Plugin, TgtImage); +} + +void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, + int32_t Kind) { + return Plugin::get().getDevice(DeviceId).dataAlloc(Size, HostPtr, + (TargetAllocTy)Kind); +} + +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) { + return Plugin::get().getDevice(DeviceId).dataDelete(TgtPtr, (TargetAllocTy)Kind); +} + +int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size) { + return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr, + void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, + AsyncInfoPtr); +} + +int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size) { + return __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr, + void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, + AsyncInfoPtr); +} + +int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr, + int32_t DstDeviceId, void *DstPtr, + int64_t Size) { + return __tgt_rtl_data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, + Size, /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, + int DstDeviceId, void *DstPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + GenericDeviceTy &SrcDevice = Plugin::get().getDevice(SrcDeviceId); + GenericDeviceTy &DstDevice = Plugin::get().getDevice(DstDeviceId); + return SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo); +} + +int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, + ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, + uint64_t LoopTripCount) { + return __tgt_rtl_run_target_team_region_async( + DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, + ThreadLimit, LoopTripCount, + /* AsyncInfoPtr */ nullptr); +} + +int32_t 
__tgt_rtl_run_target_team_region_async( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).runTargetTeamRegion( + TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, + LoopTripCount, AsyncInfoPtr); +} + +int32_t __tgt_rtl_synchronize(int32_t DeviceId, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).synchronize(AsyncInfoPtr); +} + +int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs) { + return __tgt_rtl_run_target_region_async(DeviceId, TgtEntryPtr, TgtArgs, + TgtOffsets, NumArgs, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, + void *TgtEntryPtr, void **TgtArgs, + ptrdiff_t *TgtOffsets, + int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr) { + return __tgt_rtl_run_target_team_region_async( + DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, + /* team num*/ 1, /* thread limit */ 1, /* loop tripcount */ 0, + AsyncInfoPtr); +} + +void __tgt_rtl_print_device_info(int32_t DeviceId) { + Plugin::get().getDevice(DeviceId).printInfo(); +} + +int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) { + return Plugin::get().getDevice(DeviceId).createEvent(EventPtr); +} + +int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr); +} + +int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr) { + return Plugin::get().getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr); +} + +int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) { + return Plugin::get().getDevice(DeviceId).syncEvent(EventPtr); +} + +int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) { + return Plugin::get().getDevice(DeviceId).destroyEvent(EventPtr); +} + +void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { + std::atomic &InfoLevel = getInfoLevelInternal(); + InfoLevel.store(NewInfoLevel); +} + +int32_t __tgt_rtl_init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr) { + assert(AsyncInfoPtr && "async_info is nullptr"); + + return Plugin::get().getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr); +} + +int32_t __tgt_rtl_init_device_info(int32_t DeviceId, + __tgt_device_info *DeviceInfo, + const char **ErrStr) { + return Plugin::get().getDevice(DeviceId).initDeviceInfo(DeviceInfo, ErrStr); +} + +#ifdef __cplusplus +} +#endif Index: openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt @@ -0,0 +1,111 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. 
+# +##===----------------------------------------------------------------------===## +set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL + "Whether to build CUDA plugin") +if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN) + libomptarget_say("Not building CUDA NextGen offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false") + return() +endif() + +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +endif() + +libomptarget_say("Building CUDA NextGen offloading plugin.") + +# Define the suffix for the runtime messaging dumps. +add_definitions("-DTARGET_NAME=CUDA") + +# Define debug prefix. TODO: This should be automatized in the Debug.h but it +# requires changing the original plugins. +add_definitions(-DDEBUG_PREFIX="TARGET CUDA RTL") + +set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) +option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) + +set(LIBOMPTARGET_CAN_LINK_LIBCUDA FALSE) +if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) + set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE) +endif() + +if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA NextGen plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) + add_llvm_library(omptarget.rtl.cuda.nextgen SHARED + + src/rtl.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + + NO_INSTALL_RPATH + ) +else() + libomptarget_say("Building CUDA NextGen plugin for dlopened libcuda") + include_directories(dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda.nextgen + SHARED + + src/rtl.cpp + dynamic_cuda/cuda.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${CMAKE_DL_LIBS} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + + NO_INSTALL_RPATH + ) +endif() +add_dependencies(omptarget.rtl.cuda.nextgen omptarget.devicertl.nvptx) + +# Install plugin under the lib destination folder. +install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") +set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES + INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") + +target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} +) + +# Report to the parent scope that we are building a plugin for CUDA. 
+# This controls whether tests are run for the nvptx offloading target +# Run them if libcuda is available, or if the user explicitly asked for dlopen +# Otherwise this plugin is being built speculatively and there may be no cuda available +#if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) +# libomptarget_say("Enable tests using CUDA NextGen plugin") +# set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-nextgen nvptx64-nvidia-cuda-nextgen-oldDriver" PARENT_SCOPE) +# set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-nextgen nvptx64-nvidia-cuda-nextgen-LTO" PARENT_SCOPE) +# list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.cuda.nextgen") +# set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) +#else() +# libomptarget_say("Disabling tests using CUDA NextGen plugin as cuda may not be available") +#endif() Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -0,0 +1,266 @@ +//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The parts of the cuda api that are presently in use by the openmp cuda plugin +// +//===----------------------------------------------------------------------===// + +#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED +#define DYNAMIC_CUDA_CUDA_H_INCLUDED + +#include +#include + +typedef int CUdevice; +typedef uintptr_t CUdeviceptr; +typedef struct CUmod_st *CUmodule; +typedef struct CUctx_st *CUcontext; +typedef struct CUfunc_st *CUfunction; +typedef struct CUstream_st *CUstream; +typedef struct CUevent_st *CUevent; + +typedef enum cudaError_enum { + CUDA_SUCCESS = 0, + CUDA_ERROR_INVALID_VALUE = 1, + CUDA_ERROR_NO_DEVICE = 100, + CUDA_ERROR_INVALID_HANDLE = 400, +} CUresult; + +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, + CU_STREAM_NON_BLOCKING = 0x1, +} CUstream_flags; + +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x0, + CU_LIMIT_PRINTF_FIFO_SIZE = 0x1, + CU_LIMIT_MALLOC_HEAP_SIZE = 0x2, + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x3, + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x4, + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x5, + CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x6, + CU_LIMIT_MAX +} CUlimit; + +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, + 
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, + 
CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, + CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, + CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, + CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, + CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, + CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, + CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, + CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, + CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, + CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, + CU_DEVICE_ATTRIBUTE_MAX, +} CUdevice_attribute; + +typedef enum CUfunction_attribute_enum { + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, +} CUfunction_attribute; + +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, + CU_CTX_SCHED_MASK = 0x07, +} CUctx_flags; + +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, + CU_MEM_ATTACH_HOST = 0x2, + CU_MEM_ATTACH_SINGLE = 0x4, +} CUmemAttach_flags; + +typedef enum CUcomputeMode_enum { + CU_COMPUTEMODE_DEFAULT = 0, + CU_COMPUTEMODE_PROHIBITED = 2, + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, +} CUcompute_mode; + +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, + CU_EVENT_BLOCKING_SYNC = 0x1, + CU_EVENT_DISABLE_TIMING = 0x2, + CU_EVENT_INTERPROCESS = 0x4 +} CUevent_flags; + +CUresult cuCtxGetDevice(CUdevice *); +CUresult cuDeviceGet(CUdevice *, int); +CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice); +CUresult cuDeviceGetCount(int *); +CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction); + +// Device info +CUresult 
cuDeviceGetName(char *, int, CUdevice); +CUresult cuDeviceTotalMem(size_t *, CUdevice); +CUresult cuDriverGetVersion(int *); + +CUresult cuGetErrorString(CUresult, const char **); +CUresult cuInit(unsigned); +CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, + unsigned, unsigned, unsigned, CUstream, void **, + void **); + +CUresult cuMemAlloc(CUdeviceptr *, size_t); +CUresult cuMemAllocHost(void **, size_t); +CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); + +CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); +CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t); +CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream); +CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t); +CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream); + +CUresult cuMemFree(CUdeviceptr); +CUresult cuMemFreeHost(void *); + +CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *); +CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *); + +CUresult cuModuleUnload(CUmodule); +CUresult cuStreamCreate(CUstream *, unsigned); +CUresult cuStreamDestroy(CUstream); +CUresult cuStreamSynchronize(CUstream); +CUresult cuCtxSetCurrent(CUcontext); +CUresult cuDevicePrimaryCtxRelease(CUdevice); +CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *); +CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned); +CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice); +CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *, + void **); + +CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice); +CUresult cuCtxEnablePeerAccess(CUcontext, unsigned); +CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, + size_t, CUstream); + +CUresult cuCtxGetLimit(size_t *, CUlimit); +CUresult cuCtxSetLimit(CUlimit, size_t); + +CUresult cuEventCreate(CUevent *, unsigned int); +CUresult cuEventRecord(CUevent, CUstream); +CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int); +CUresult cuEventSynchronize(CUevent); +CUresult cuEventDestroy(CUevent); + +#endif Index: openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -0,0 +1,145 @@ +//===--- cuda/dynamic_cuda/cuda.pp ------------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implement subset of cuda api by calling into cuda library via dlopen +// Does the dlopen/dlsym calls as part of the call to cuInit +// +//===----------------------------------------------------------------------===// + +#include "cuda.h" +#include "Debug.h" +#include "dlwrap.h" + +#include +#include + +#include + +DLWRAP_INITIALIZE(); + +DLWRAP_INTERNAL(cuInit, 1); + +DLWRAP(cuCtxGetDevice, 1); +DLWRAP(cuDeviceGet, 2); +DLWRAP(cuDeviceGetAttribute, 3); +DLWRAP(cuDeviceGetCount, 1); +DLWRAP(cuFuncGetAttribute, 3); + +// Device info +DLWRAP(cuDeviceGetName, 3); +DLWRAP(cuDeviceTotalMem, 2); +DLWRAP(cuDriverGetVersion, 1); + +DLWRAP(cuGetErrorString, 2); +DLWRAP(cuLaunchKernel, 11); + +DLWRAP(cuMemAlloc, 2); +DLWRAP(cuMemAllocHost, 2); +DLWRAP(cuMemAllocManaged, 3); + +DLWRAP(cuMemcpyDtoDAsync, 4); +DLWRAP(cuMemcpyDtoH, 3); +DLWRAP(cuMemcpyDtoHAsync, 4); +DLWRAP(cuMemcpyHtoD, 3); +DLWRAP(cuMemcpyHtoDAsync, 4); + +DLWRAP(cuMemFree, 1); +DLWRAP(cuMemFreeHost, 1); +DLWRAP(cuModuleGetFunction, 3); +DLWRAP(cuModuleGetGlobal, 4); + +DLWRAP(cuModuleUnload, 1); +DLWRAP(cuStreamCreate, 2); +DLWRAP(cuStreamDestroy, 1); +DLWRAP(cuStreamSynchronize, 1); +DLWRAP(cuCtxSetCurrent, 1); +DLWRAP(cuDevicePrimaryCtxRelease, 1); +DLWRAP(cuDevicePrimaryCtxGetState, 3); +DLWRAP(cuDevicePrimaryCtxSetFlags, 2); +DLWRAP(cuDevicePrimaryCtxRetain, 2); +DLWRAP(cuModuleLoadDataEx, 5); + +DLWRAP(cuDeviceCanAccessPeer, 3); +DLWRAP(cuCtxEnablePeerAccess, 2); +DLWRAP(cuMemcpyPeerAsync, 6); + +DLWRAP(cuCtxGetLimit, 2); +DLWRAP(cuCtxSetLimit, 2); + +DLWRAP(cuEventCreate, 2); +DLWRAP(cuEventRecord, 2); +DLWRAP(cuStreamWaitEvent, 3); +DLWRAP(cuEventSynchronize, 1); +DLWRAP(cuEventDestroy, 1); + +DLWRAP_FINALIZE(); + +#ifndef DYNAMIC_CUDA_PATH +#define DYNAMIC_CUDA_PATH "libcuda.so" +#endif + +static bool checkForCUDA() { + // return true if dlopen succeeded and all functions found + + // Prefer _v2 versions of functions if found in the library + std::unordered_map TryFirst = { + {"cuMemAlloc", "cuMemAlloc_v2"}, + {"cuMemFree", "cuMemFree_v2"}, + {"cuMemcpyDtoH", "cuMemcpyDtoH_v2"}, + {"cuMemcpyHtoD", "cuMemcpyHtoD_v2"}, + {"cuStreamDestroy", "cuStreamDestroy_v2"}, + {"cuModuleGetGlobal", "cuModuleGetGlobal_v2"}, + {"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"}, + {"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"}, + {"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"}, + {"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"}, + {"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"}, + }; + + const char *CudaLib = DYNAMIC_CUDA_PATH; + void *DynlibHandle = dlopen(CudaLib, RTLD_NOW); + if (!DynlibHandle) { + DP("Unable to load library '%s': %s!\n", CudaLib, dlerror()); + return false; + } + + for (size_t I = 0; I < dlwrap::size(); I++) { + const char *Sym = dlwrap::symbol(I); + + auto It = TryFirst.find(Sym); + if (It != TryFirst.end()) { + const char *First = It->second; + void *P = dlsym(DynlibHandle, First); + if (P) { + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P); + *dlwrap::pointer(I) = P; + continue; + } + } + + void *P = dlsym(DynlibHandle, Sym); + if (P == nullptr) { + DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib); + return false; + } + DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P); + + *dlwrap::pointer(I) = P; + } + + return true; +} + +CUresult cuInit(unsigned X) { + // Note: Called exactly once from cuda rtl.cpp in a global constructor 
so + // does not need to handle being called repeatedly or concurrently + if (!checkForCUDA()) { + return CUDA_ERROR_INVALID_HANDLE; + } + return dlwrap_cuInit(X); +} Index: openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -0,0 +1,1025 @@ +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL NextGen for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Forward declarations for all specialized data structures. +struct CUDAKernelTy; +struct CUDADeviceTy; +struct CUDAPluginTy; +struct CUDAStreamManagerTy; +struct CUDAEventManagerTy; + +// TODO: Improve this function. +template +static bool checkResult(CUresult Err, const char *ErrMsg, ArgsTy... Args) { + if (Err == CUDA_SUCCESS) + return false; + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wformat-security" + REPORT(ErrMsg, Args...); +#pragma clang diagnostic pop + + const char *ErrStr = nullptr; + CUresult ErrStrStatus = cuGetErrorString(Err, &ErrStr); + if (ErrStrStatus == CUDA_ERROR_INVALID_VALUE) { + REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code: %d\n", Err); + } else if (ErrStrStatus == CUDA_SUCCESS) { + REPORT(GETNAME(TARGET_NAME) " error is: %s\n", ErrStr); + } else { + REPORT("Unresolved " GETNAME(TARGET_NAME) " error code: %d\n" + "Unsuccessful cuGetErrorString return status: %d\n", Err, ErrStrStatus); + } + return true; +} + +/// Class implementing the CUDA kernel functionalities which derives from the +/// generic kernel class. +struct CUDAKernelTy : public GenericKernelTy { + /// Create a CUDA kernel with a name, an execution mode, and the kernel function. + CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, + CUfunction Func) + : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + + /// Initialize the CUDA kernel + StatusCode initImpl(GenericDeviceTy &GenericDevice) override { + int MaxThreads; + CUresult Err = cuFuncGetAttribute( + &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func); + if (checkResult(Err, "Error returned from cuFuncGetAttribute\n")) + return StatusCode(Err); + + /// Set the maximum number of threads for the CUDA kernel. + MaxNumThreads = std::min(MaxNumThreads, MaxThreads); + + return StatusCode::SUCCESS; + } + + /// Launch the CUDA kernel function + StatusCode launchImpl(GenericDeviceTy &GenericDevice, int32_t NumThreads, + int32_t NumBlocks, int32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + + /// The default number of blocks is common to the whole device. 
+ int32_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumBlocks(); + } + + /// The default number of threads is common to the whole device. + int32_t getDefaultNumThreads(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumThreads(); + } + +private: + /// The CUDA kernel function to execute. + CUfunction Func; +}; + +/// Class wrapping a CUDA stream reference. These are the objects handled by the +/// Stream Manager for the CUDA plugin. +class CUDAStreamRef final : public GenericDeviceResourceRef { + /// The reference to the CUDA stream. + CUstream Stream; + +public: + /// Create an empty reference to an invalid stream. + CUDAStreamRef() : Stream(nullptr) {} + + /// Create a reference to an existing stream. + CUDAStreamRef(CUstream Stream) : Stream(Stream) {} + + /// Create a new stream and save the reference. The reference must be empty + /// before calling to this function. + StatusCode create() override { + if (Stream != nullptr) + return StatusCode::FAIL; + + CUresult Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING); + if (checkResult(Err, "Error returned from cuStreamCreate\n")) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; + } + + /// Destroy the referenced stream and invalidate the reference. The reference + /// must be to a valid stream before calling to this function. + StatusCode destroy() override { + if (Stream == nullptr) + return StatusCode::FAIL; + + CUresult Err = cuStreamDestroy(Stream); + if (checkResult(Err, "Error returned from cuStreamDestroy\n")) + return StatusCode::FAIL; + + Stream = nullptr; + return StatusCode::SUCCESS; + } + + /// Get the underlying CUstream. + operator CUstream() const { return Stream; } +}; + +/// Class wrapping a CUDA event reference. These are the objects handled by the +/// Event Manager for the CUDA plugin. +class CUDAEventRef final : public GenericDeviceResourceRef { + CUevent Event; + +public: + /// Create an empty reference to an invalid event. + CUDAEventRef() : Event(nullptr) {} + + /// Create a reference to an existing event. + CUDAEventRef(CUevent Event) : Event(Event) {} + + /// Create a new event and save the reference. The reference must be empty + /// before calling to this function. + StatusCode create() override { + if (Event != nullptr) + return StatusCode::FAIL; + + CUresult Err = cuEventCreate(&Event, CU_EVENT_DEFAULT); + if (checkResult(Err, "Error returned from cuEventCreate\n")) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; + } + + /// Destroy the referenced event and invalidate the reference. The reference + /// must be to a valid event before calling to this function. + StatusCode destroy() override { + if (Event == nullptr) + return StatusCode::FAIL; + + CUresult Err = cuEventDestroy(Event); + if (checkResult(Err, "Error returned from cuEventDestroy\n")) + return StatusCode::FAIL; + + Event = nullptr; + return StatusCode::SUCCESS; + } + + /// Get the underlying CUevent. + operator CUevent() const { return Event; } +}; + +/// Class implementing the CUDA device images properties. +struct CUDADeviceImageTy : public DeviceImageTy { + /// Create the CUDA image with the id and the target image pointer. + CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} + + /// Getter and setter for the CUDA module. 
+ CUmodule getModule() const { return Module; } + void setModule(CUmodule CUDAModule) { Module = CUDAModule; } + +private: + /// The CUDA module that loaded the image. + CUmodule Module; +}; + +/// Class implementing the CUDA device functionalities which derives from the +/// generic device class. +struct CUDADeviceTy : public GenericDeviceTy { + // Create a CUDA device with a device id and the default CUDA grid values. + CUDADeviceTy(int32_t DeviceId, int32_t NumDevices) + : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues), + CUDAStreamManager(*this), CUDAEventManager(*this) {} + + ~CUDADeviceTy() {} + + /// Initialize the device, its resources and get its properties. + StatusCode initImpl(GenericPluginTy &Plugin) override { + CUresult Err = cuDeviceGet(&Device, DeviceId); + if (checkResult(Err, "Error returned from cuDeviceGet\n")) + return StatusCode::FAIL; + + // Query the current flags of the primary context and set its flags if + // it is inactive. + unsigned int FormerPrimaryCtxFlags = 0; + int FormerPrimaryCtxIsActive = 0; + Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, + &FormerPrimaryCtxIsActive); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n")) + return StatusCode::FAIL; + + if (FormerPrimaryCtxIsActive) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "The primary context is active, no change to its flags\n"); + if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != + CU_CTX_SCHED_BLOCKING_SYNC) + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); + } else { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "The primary context is inactive, set its flags to " + "CU_CTX_SCHED_BLOCKING_SYNC\n"); + Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n")) + return StatusCode::FAIL; + } + + // Retain the per device primary context and save it to use whenever this + // device is selected. + Err = cuDevicePrimaryCtxRetain(&Context, Device); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n")) + return StatusCode::FAIL; + + if (setContext()) + return StatusCode::FAIL; + + // Initialize stream pool. + if (CUDAStreamManager.init()) + return StatusCode::FAIL; + + if (CUDAEventManager.init()) + return StatusCode::FAIL; + + // Query attributes to determine number of threads/block and blocks/grid. + if (getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + GridValues.GV_Max_Teams)) + return StatusCode::FAIL; + if (getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + GridValues.GV_Max_WG_Size)) + return StatusCode::FAIL; + if (getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, GridValues.GV_Warp_Size)) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; + } + + /// Deinitialize the device and release its resources. + StatusCode deinitImpl() override { + if (Context && setContext()) + return StatusCode::FAIL; + + // Deinitialize the stream manager. + if (CUDAStreamManager.deinit()) + return StatusCode::FAIL; + + if (CUDAEventManager.deinit()) + return StatusCode::FAIL; + + // Close modules if necessary. + if (!LoadedImages.empty()) { + assert(Context && "Context is invalid"); + + // Each image has its own module. + for (DeviceImageTy *Image : LoadedImages) { + CUDADeviceImageTy &CUDAImage = + static_cast(*Image); + + // Unload the module of the image. 
+ CUresult Err = cuModuleUnload(CUDAImage.getModule()); + if (checkResult(Err, "Error returned from cuModuleUnload\n")) + return StatusCode(Err); + + CUDAImage.setModule(nullptr); + } + } + + if (Context) { + CUresult Err = cuDevicePrimaryCtxRelease(Device); + if (checkResult(Err, "Error returned from cuDevicePrimaryCtxRelease\n")) + return StatusCode(Err); + } + + // Invalidate context and device references. + Context = nullptr; + Device = CU_DEVICE_INVALID; + + return StatusCode::SUCCESS; + } + + /// Allocate and construct a CUDA kernel. + GenericKernelTy * + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + CUDADeviceImageTy &CUDAImage = + static_cast(Image); + + // Retrieve the function pointer of the kernel. + CUfunction Func; + CUresult Err = cuModuleGetFunction(&Func, CUDAImage.getModule(), + KernelEntry.name); + if (checkResult(Err, "Failed loading '%s'\n", KernelEntry.name)) + return nullptr; + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry), + KernelEntry.name, DPxPTR(Func)); + + // Create a metadata object for the exec mode global (auto-generated). + StaticGlobalTy ExecModeGlobal( + KernelEntry.name, "_exec_mode"); + + // Retrieve execution mode for the kernel. + GenericGlobalHandlerTy &GlobalHandler = Plugin::get().getGlobalHandler(); + if (GlobalHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) { + DP("Failed to read execution mode for '%s', using default GENERIC (1)\n", + KernelEntry.name); + // In some cases the execution mode is not included, so use the default. + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_GENERIC); + } + + // Allocate and initialize the CUDA kernel. + CUDAKernelTy *CUDAKernel = Plugin::get().allocate(); + new (CUDAKernel) + CUDAKernelTy(KernelEntry.name, ExecModeGlobal.getValue(), Func); + + return CUDAKernel; + } + + /// Set the current context to this device's context. + StatusCode setContext() override { + CUresult Err = cuCtxSetCurrent(Context); + if (checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return StatusCode::FAIL; + return StatusCode::SUCCESS; + } + + /// Get the stream of the asynchronous info sructure or get a new one. + CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { + CUstream &Stream = AsyncInfoWrapper.getQueueAs(); + if (!Stream) + Stream = CUDAStreamManager.getStream(); + return Stream; + } + + /// Getters of CUDA references. + CUcontext getCUDAContext() const { return Context; } + CUdevice getCUDADevice() const { return Device; } + + /// Load the binary image into the device and allocate an image object. + DeviceImageTy *loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) override { + if (setContext()) + return nullptr; + + // Allocate and initialize the image object. + CUDADeviceImageTy *CUDAImage = Plugin::get().allocate(); + new (CUDAImage) + CUDADeviceImageTy(ImageId, TgtImage); + + // Load the binary as a CUDA module. + CUmodule Module = nullptr; + CUresult Err = cuModuleLoadDataEx(&Module, CUDAImage->getStart(), 0, + nullptr, nullptr); + if (checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) + return nullptr; + + // Save the reference of the module on the image. + CUDAImage->setModule(Module); + + return CUDAImage; + } + + /// Allocate memory on the device or related to the device. 
+ void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + if (Size == 0) + return nullptr; + + if (setContext()) + return nullptr; + + CUresult Err; + void *MemAlloc = nullptr; + CUdeviceptr DevicePtr; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + Err = cuMemAlloc(&DevicePtr, Size); + if (checkResult(Err, "Error returned from cuMemAlloc\n")) + return nullptr; + MemAlloc = (void *)DevicePtr; + break; + case TARGET_ALLOC_HOST: + Err = cuMemAllocHost(&MemAlloc, Size); + if (checkResult(Err, "Error returned from cuMemAllocHost\n")) + return nullptr; + break; + case TARGET_ALLOC_SHARED: + Err = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); + if (checkResult(Err, "Error returned from cuMemAllocManaged\n")) + return nullptr; + MemAlloc = (void *)DevicePtr; + break; + } + + return MemAlloc; + } + + /// Deallocate memory on the device or related to the device. + int free(void *TgtPtr, TargetAllocTy Kind) override { + if (TgtPtr == nullptr) + return OFFLOAD_SUCCESS; + + if (setContext()) + return OFFLOAD_FAIL; + + CUresult Err; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_SHARED: + Err = cuMemFree((CUdeviceptr)TgtPtr); + if (checkResult(Err, "Error returned from cuMemFree\n")) + return OFFLOAD_FAIL; + break; + case TARGET_ALLOC_HOST: + Err = cuMemFreeHost(TgtPtr); + if (checkResult(Err, "Error returned from cuMemFreeHost\n")) + return OFFLOAD_FAIL; + break; + } + + return OFFLOAD_SUCCESS; + } + + /// Synchronize current thread with the pending operations on the async info. + StatusCode synchronizeImpl(__tgt_async_info &AsyncInfo) override { + CUstream Stream = reinterpret_cast(AsyncInfo.Queue); + CUresult Err = cuStreamSynchronize(Stream); + + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo. This is to make sure the synchronization only works for its + // own tasks. + CUDAStreamManager.returnStream(Stream); + AsyncInfo.Queue = nullptr; + + return StatusCode(Err); + } + + /// Submit data to the device (host to device transfer). + StatusCode dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (setContext()) + return StatusCode::FAIL; + + CUstream Stream = getStream(AsyncInfoWrapper); + + return StatusCode( + cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream)); + } + + /// Retrieve data from the device (device to host transfer). + StatusCode dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (setContext()) + return StatusCode::FAIL; + + CUstream Stream = getStream(AsyncInfoWrapper); + + return StatusCode( + cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream)); + } + + /// Exchange data between two devices directly. We may use peer access if + /// the CUDA devices and driver allow them. + StatusCode dataExchangeImpl(const void *SrcPtr, + GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + + /// Initialize the async info for interoperability purposes. + StatusCode initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (setContext()) + return StatusCode::FAIL; + + getStream(AsyncInfoWrapper); + + return StatusCode::SUCCESS; + } + + /// Initialize the device info for interoperability purposes. 
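For reference, the TARGET_ALLOC_* kinds handled by allocate()/free() above are driven from host code that goes through libomptarget's allocator entry points. A minimal sketch using only the standard OpenMP API; the HOST and SHARED kinds are reachable through LLVM-specific allocator extensions that are not shown here.

#include <omp.h>
#include <cstdio>

int main() {
  int Dev = omp_get_default_device();

  // omp_target_alloc() reaches the plugin's allocate() with the default
  // kind, which this CUDA plugin maps to cuMemAlloc().
  void *DevPtr = omp_target_alloc(1024, Dev);
  if (!DevPtr) {
    std::printf("no device memory (or no device) available\n");
    return 0;
  }

  // omp_target_free() reaches the plugin's free() for the same allocation.
  omp_target_free(DevPtr, Dev);
  return 0;
}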
+ StatusCode initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { + assert(Context && "Context is null"); + assert(Device != CU_DEVICE_INVALID && "Device is invalid"); + + if (setContext()) + return StatusCode::FAIL; + + if (!DeviceInfo->Context) + DeviceInfo->Context = Context; + + if (!DeviceInfo->Device) + DeviceInfo->Device = reinterpret_cast(Device); + + return StatusCode::SUCCESS; + } + + /// Create an event. + StatusCode createEventImpl(void **EventPtrStorage) override { + CUevent *Event = reinterpret_cast(EventPtrStorage); + *Event = CUDAEventManager.getEvent(); + return StatusCode::SUCCESS; + } + + /// Destroy a previously created event. + StatusCode destroyEventImpl(void *EventPtr) override { + CUevent Event = reinterpret_cast(EventPtr); + CUDAEventManager.returnEvent(Event); + return StatusCode::SUCCESS; + } + + /// Record the event. + StatusCode recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + CUevent Event = reinterpret_cast(EventPtr); + return StatusCode(cuEventRecord(Event, getStream(AsyncInfoWrapper))); + } + + /// Make the stream wait on the event. + StatusCode waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from + // specific CUDA version, and defined as 0x0. In previous version, per CUDA + // API document, that argument has to be 0x0. + CUevent Event = reinterpret_cast(EventPtr); + return StatusCode(cuStreamWaitEvent(getStream(AsyncInfoWrapper), Event, 0)); + } + + /// Synchronize the current thread with the event. + StatusCode syncEventImpl(void *EventPtr) override { + CUevent Event = reinterpret_cast(EventPtr); + return StatusCode(cuEventSynchronize(Event)); + } + + /// Print information about the device. 
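The event entry points above wrap a standard CUDA ordering idiom: record an event on one stream and make another stream (or the host) wait for it. A stand-alone driver-API sketch of that idiom, independent of the plugin, is shown below.

#include <cuda.h>
#include <cstdio>

static void check(CUresult R, const char *Msg) {
  if (R != CUDA_SUCCESS)
    std::printf("%s failed (%d)\n", Msg, (int)R);
}

int main() {
  CUdevice Dev;
  CUcontext Ctx;
  CUstream A, B;
  CUevent Ev;

  check(cuInit(0), "cuInit");
  check(cuDeviceGet(&Dev, 0), "cuDeviceGet");
  check(cuCtxCreate(&Ctx, 0, Dev), "cuCtxCreate");
  check(cuStreamCreate(&A, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  check(cuStreamCreate(&B, CU_STREAM_NON_BLOCKING), "cuStreamCreate");
  check(cuEventCreate(&Ev, CU_EVENT_DEFAULT), "cuEventCreate");

  // recordEventImpl: snapshot the work currently enqueued on stream A.
  check(cuEventRecord(Ev, A), "cuEventRecord");
  // waitEventImpl: stream B will not run past this point until A reaches it.
  check(cuStreamWaitEvent(B, Ev, 0), "cuStreamWaitEvent");
  // syncEventImpl: block the host until the recorded work has completed.
  check(cuEventSynchronize(Ev), "cuEventSynchronize");

  cuEventDestroy(Ev);
  cuStreamDestroy(A);
  cuStreamDestroy(B);
  cuCtxDestroy(Ctx);
  return 0;
}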
+ void printInfoImpl() override { + char TmpChar[1000]; + std::string TmpStr; + size_t TmpSt; + int TmpInt, TmpInt2, TmpInt3; + + cuDriverGetVersion(&TmpInt); + printf(" CUDA Driver Version: \t\t%d \n", TmpInt); + printf(" CUDA Device Number: \t\t%d \n", DeviceId); + checkResult(cuDeviceGetName(TmpChar, 1000, Device), + "Error returned from cuDeviceGetName\n"); + printf(" Device Name: \t\t\t%s \n", TmpChar); + checkResult(cuDeviceTotalMem(&TmpSt, Device), + "Error returned from cuDeviceTotalMem\n"); + printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Concurrent Copy and Execution: \t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Registers per Block: \t\t%d \n", TmpInt); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, + TmpInt3); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device), + "Error returned from cuDeviceGetAttribute\n"); + checkResult(cuDeviceGetAttribute( + &TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, + TmpInt3); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device), + 
"Error returned from cuDeviceGetAttribute\n"); + printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Execution Timeout: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Integrated Device: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Can Map Host Memory: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device), + "Error returned from cuDeviceGetAttribute\n"); + if (TmpInt == CU_COMPUTEMODE_DEFAULT) + TmpStr = "DEFAULT"; + else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) + TmpStr = "PROHIBITED"; + else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) + TmpStr = "EXCLUSIVE PROCESS"; + else + TmpStr = "unknown"; + printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Concurrent Kernels: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" ECC Enabled: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); + checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, + Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Async Engines: \t\t\t%s (%d) \n", StringParser::boolToYesNoString(TmpInt), TmpInt); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Unified Addressing: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult(cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Managed Memory: \t\t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device), + "Error returned from cuDeviceGetAttribute\n"); + printf(" Concurrent Managed Memory: \t\t%s \n", StringParser::boolToYesNoString(TmpInt)); + checkResult( + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device), + "Error 
returned from cuDeviceGetAttribute\n");
+    printf(" Preemption Supported: \t\t%s \n", StringParser::boolToYesNoString(TmpInt));
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
+                "Error returned from cuDeviceGetAttribute\n");
+    printf(" Cooperative Launch: \t\t%s \n", StringParser::boolToYesNoString(TmpInt));
+    checkResult(cuDeviceGetAttribute(
+                    &TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
+                "Error returned from cuDeviceGetAttribute\n");
+    printf(" Multi-Device Boards: \t\t%s \n", StringParser::boolToYesNoString(TmpInt));
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
+        "Error returned from cuDeviceGetAttribute\n");
+    checkResult(
+        cuDeviceGetAttribute(
+            &TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
+        "Error returned from cuDeviceGetAttribute\n");
+    printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
+  }
+
+  /// Getters and setters for stack and heap sizes.
+  StatusCode getDeviceStackSize(uint64_t &Value) override {
+    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
+  }
+  StatusCode setDeviceStackSize(uint64_t Value) override {
+    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
+  }
+  StatusCode getDeviceHeapSize(uint64_t &Value) override {
+    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
+  }
+  StatusCode setDeviceHeapSize(uint64_t Value) override {
+    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
+  }
+
+  /// CUDA-specific functions for getting and setting context limits.
+  StatusCode setCtxLimit(CUlimit Kind, uint64_t Value) {
+    CUresult Res = cuCtxSetLimit(Kind, Value);
+    return StatusCode(Res);
+  }
+  StatusCode getCtxLimit(CUlimit Kind, uint64_t &Value) {
+    CUresult Res = cuCtxGetLimit(&Value, Kind);
+    return StatusCode(Res);
+  }
+
+  /// CUDA-specific function to get device attributes.
+  StatusCode getDeviceAttr(uint32_t Kind, uint32_t &Value) {
+    // TODO: Warn if the new value is larger than the old.
+    CUresult Err =
+        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
+    if (checkResult(Err, "Error returned from cuDeviceGetAttribute\n"))
+      return StatusCode::FAIL;
+    return StatusCode::SUCCESS;
+  }
+
+private:
+  using CUDAStreamManagerTy = GenericStreamManagerTy;
+  using CUDAEventManagerTy = GenericEventManagerTy;
+
+  /// Stream manager for CUDA streams.
+  CUDAStreamManagerTy CUDAStreamManager;
+
+  /// Event manager for CUDA events.
+  CUDAEventManagerTy CUDAEventManager;
+
+  /// The device's context. This context should be set before performing
+  /// operations on the device.
+  CUcontext Context = nullptr;
+
+  /// The CUDA device handle.
+  CUdevice Device = CU_DEVICE_INVALID;
+};
+
+StatusCode CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
+                                    int32_t NumThreads, int32_t NumBlocks,
+                                    int32_t DynamicMemorySize,
+                                    int32_t NumKernelArgs, void *KernelArgs,
+                                    AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
+  CUstream Stream = CUDADevice.getStream(AsyncInfoWrapper);
+
+  return StatusCode(cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1,
+                                   /* gridDimZ */ 1, NumThreads,
+                                   /* blockDimY */ 1, /* blockDimZ */ 1,
+                                   DynamicMemorySize, Stream,
+                                   (void **)KernelArgs, nullptr));
+}
+
+/// Class implementing the CUDA-specific functionalities of the global handler.
+class CUDAGlobalHandler final : public GenericGlobalHandlerTy {
+public:
+  /// Get the metadata of a global from the device.
The name and size of the + /// global is read from DeviceGlobal and the address of the global is written + /// to DeviceGlobal. + StatusCode getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override { + CUDADeviceImageTy &CUDAImage = static_cast(Image); + + const char *GlobalName = DeviceGlobal.getName().data(); + + size_t CUSize; + CUdeviceptr CUPtr; + CUresult Err = cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), + GlobalName); + if (checkResult(Err, "Failed to load global '%s'\n", GlobalName)) + return StatusCode::FAIL; + + if (CUSize != DeviceGlobal.getSize()) { + DP("Failed to load global '%s' due to size mismatch (%zu != %zu)\n", + GlobalName, CUSize, (size_t) DeviceGlobal.getSize()); + return StatusCode::FAIL; + } + + DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); + return StatusCode::SUCCESS; + } +}; + +/// Class implementing the CUDA-specific functionalities of the plugin. +struct CUDAPluginTy final : public GenericPluginTy { + /// Create a CUDA plugin and initialize the CUDA driver. + CUDAPluginTy() : GenericPluginTy() { + CUresult Err = cuInit(0); + if (Err == CUDA_ERROR_INVALID_HANDLE) { + // Cannot call cuGetErrorString if dlsym failed. + DP("Failed to load CUDA shared library\n"); + return; + } else if (Err == CUDA_ERROR_NO_DEVICE) { + // Do not initialize if there are no devices. + DP("There are no devices supporting CUDA.\n"); + return; + } + + if (checkResult(Err, "Error returned from cuInit\n")) + return; + + // Get the number of devices. + int NumDevices; + Err = cuDeviceGetCount(&NumDevices); + if (checkResult(Err, "Error returned from cuDeviceGetCount\n")) + return; + + // Do not initialize if there are no devices. + if (NumDevices == 0) { + DP("There are no devices supporting CUDA.\n"); + return; + } + + // Initialize the generic plugin structure. + GenericPluginTy::init(NumDevices, new CUDAGlobalHandler()); + } + + /// This class should not be copied. + CUDAPluginTy(const CUDAPluginTy &) = delete; + CUDAPluginTy(CUDAPluginTy &&) = delete; + + ~CUDAPluginTy() {} + + /// Get the ELF code for recognizing the compatible image binary. + uint16_t getMagicElfBits() const override { return /* EM_CUDA */ 190; } + + /// Create a CUDA device with a specific id. + CUDADeviceTy &createDevice(int32_t DeviceId) override { + CUDADeviceTy *Device = new CUDADeviceTy(DeviceId, getNumDevices()); + return *Device; + } + + /// Check whether the image is compatible with a CUDA device. + bool isImageCompatible(int32_t DeviceId, + __tgt_image_info *Info) const override { + assert(isValidDeviceId(DeviceId) && "Invalid CUDA device"); + + CUdevice Device; + if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS) + return false; + + int32_t Major, Minor; + if (cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + Device) != CUDA_SUCCESS) + return false; + + if (cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + Device) != CUDA_SUCCESS) + return false; + + StringRef ArchStr(Info->Arch); + StringRef PrefixStr("sm_"); + if (!ArchStr.startswith(PrefixStr)) { + REPORT("Unrecognized image architecture %s\n", ArchStr.data()); + return false; + } + + int32_t ImageMajor = ArchStr[PrefixStr.size()+0] - '0'; + int32_t ImageMinor = ArchStr[PrefixStr.size()+1] - '0'; + + // A cubin generated for a certain compute capability is supported to run on + // any GPU with the same major revision and same or higher minor revision. 
+    return (Major == ImageMajor && Minor >= ImageMinor);
+  }
+};
+
+StatusCode
+CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
+                               GenericDeviceTy &DstGenericDevice,
+                               void *DstPtr, int64_t Size,
+                               AsyncInfoWrapperTy &AsyncInfoWrapper) {
+  if (setContext())
+    return StatusCode::FAIL;
+
+  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);
+
+  CUresult Err;
+  int32_t DstDeviceId = DstDevice.DeviceId;
+  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
+  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;
+
+  int CanAccessPeer = 0;
+  if (DeviceId != DstDeviceId) {
+    // Make sure the lock is released before performing the copies.
+    std::lock_guard Lock(PeerAccessesLock);
+
+    switch (PeerAccesses[DstDeviceId]) {
+    case PeerAccessState::AVAILABLE:
+      CanAccessPeer = 1;
+      break;
+    case PeerAccessState::UNAVAILABLE:
+      CanAccessPeer = 0;
+      break;
+    case PeerAccessState::PENDING:
+      // Check whether the source device can access the destination device.
+      Err = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
+      if (checkResult(Err, "Error returned from cuDeviceCanAccessPeer\n"))
+        return StatusCode(Err);
+
+      if (CanAccessPeer) {
+        Err = cuCtxEnablePeerAccess(DstDevice.Context, 0);
+        if (Err == CUDA_ERROR_TOO_MANY_PEERS) {
+          // Resources may be exhausted due to many P2P links.
+          CanAccessPeer = 0;
+          DP("Too many P2P links, falling back to D2D memcpy\n");
+        } else if (checkResult(Err,
+                               "Error returned from cuCtxEnablePeerAccess\n"))
+          return StatusCode(Err);
+      }
+      PeerAccesses[DstDeviceId] = (CanAccessPeer)
+                                      ? PeerAccessState::AVAILABLE
+                                      : PeerAccessState::UNAVAILABLE;
+      break;
+    }
+  }
+
+  CUstream Stream = getStream(AsyncInfoWrapper);
+
+  // TODO: Should we fall back to D2D if peer access fails?
+  if (CanAccessPeer)
+    return StatusCode(cuMemcpyPeerAsync(CUDstPtr, DstDevice.Context, CUSrcPtr,
+                                        Context, Size, Stream));
+
+  // Fallback to D2D copy.
+  return StatusCode(cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream));
+}
+
+StatusCode Plugin::init() {
+  // Call the getter to initialize the CUDA plugin.
+  get();
+  return StatusCode::SUCCESS;
+}
+
+StatusCode Plugin::deinit() {
+  // The CUDA plugin and the CUDA driver should already be deinitialized
+  // at this point. So do nothing for this plugin.
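dataExchangeImpl() above is what can service an application-level device-to-device copy when both devices belong to this plugin and peer access is available, with cuMemcpyDtoDAsync as the fallback. A hedged host-side example that may exercise that path (the runtime can also choose to stage through the host depending on configuration):

#include <omp.h>
#include <cstdio>

int main() {
  if (omp_get_num_devices() < 2) {
    std::printf("need two devices for a device-to-device copy\n");
    return 0;
  }

  const size_t Size = 1024;
  void *SrcPtr = omp_target_alloc(Size, /*device_num=*/0);
  void *DstPtr = omp_target_alloc(Size, /*device_num=*/1);

  // Copy between the two device allocations; on the CUDA plugin this can
  // use cuMemcpyPeerAsync when peer access is enabled.
  int Rc = omp_target_memcpy(DstPtr, SrcPtr, Size, /*dst_offset=*/0,
                             /*src_offset=*/0, /*dst_device=*/1,
                             /*src_device=*/0);
  std::printf("omp_target_memcpy returned %d\n", Rc);

  omp_target_free(SrcPtr, 0);
  omp_target_free(DstPtr, 1);
  return 0;
}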
+ if (Plugin::isActive()) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; +} + +GenericPluginTy &Plugin::get() { + static CUDAPluginTy CUDAPlugin; + assert(Plugin::isActive() && "Plugin is not active"); + return CUDAPlugin; +} + +const char *Plugin::getErrorStr(const StatusCode &SC) { + CUresult ErrorCode = static_cast(SC.getCode()); + + const char *ErrorStr = nullptr; + CUresult Ret = cuGetErrorString(ErrorCode, &ErrorStr); + if (Ret == CUDA_SUCCESS) { + return ErrorStr; + } else if (Ret == CUDA_ERROR_INVALID_VALUE) { + REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code: %d\n", ErrorCode); + } else { + checkResult(Ret, "Error returned from cuGetErrorString\n"); + } + return nullptr; +} + +} // plugin +} // target +} // omp +} // llvm Index: openmp/libomptarget/plugins-nextgen/exports =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/exports @@ -0,0 +1,40 @@ +VERS1.0 { + global: + __tgt_rtl_init_plugin; + __tgt_rtl_deinit_plugin; + __tgt_rtl_is_valid_binary; + __tgt_rtl_is_valid_binary_info; + __tgt_rtl_is_data_exchangable; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_requires; + __tgt_rtl_init_device; + __tgt_rtl_deinit_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_exchange; + __tgt_rtl_data_exchange_async; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_team_region_async; + __tgt_rtl_run_target_region; + __tgt_rtl_run_target_region_async; + __tgt_rtl_synchronize; + __tgt_rtl_register_lib; + __tgt_rtl_unregister_lib; + __tgt_rtl_supports_empty_images; + __tgt_rtl_set_info_flag; + __tgt_rtl_print_device_info; + __tgt_rtl_create_event; + __tgt_rtl_record_event; + __tgt_rtl_wait_event; + __tgt_rtl_sync_event; + __tgt_rtl_destroy_event; + __tgt_rtl_init_device_info; + __tgt_rtl_init_async_info; + local: + *; +}; Index: openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -0,0 +1,417 @@ +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL NextGen for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/DynamicLibrary.h" + +// The number of devices in this plugin. +#define NUM_DEVICES 4 + +// The ELF ID should be defined at compile-time by the build system. +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Forward declarations for all specialized data structures. 
+struct GenELF64KernelTy; +struct GenELF64DeviceTy; +struct GenELF64PluginTy; + +using llvm::sys::DynamicLibrary; + +/// Class implementing kernel functionalities for GenELF64. +struct GenELF64KernelTy : public GenericKernelTy { + /// Construct the kernel with a name, execution mode and a function. + GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, + void (*Func)(void)) + : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + + /// Initialize the kernel. + StatusCode initImpl(GenericDeviceTy &GenericDevice) override { + // Set the maximum number of threads to a single. + MaxNumThreads = 1; + return StatusCode::SUCCESS; + } + + /// Launch the kernel using the libffi. + StatusCode launchImpl(GenericDeviceTy &GenericDevice, int32_t NumThreads, + int32_t NumBlocks, int32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override { + // Create a vector of ffi_types, one per argument. + SmallVector ArgTypes(NumKernelArgs, &ffi_type_pointer); + ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr; + + // Prepare the cif structure before running the kernel function. + ffi_cif Cif; + ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, NumKernelArgs, + &ffi_type_void, ArgTypesPtr); + if (Status != FFI_OK) + return StatusCode::FAIL; + + // Call the kernel function through libffi. + long Return; + ffi_call(&Cif, Func, &Return, (void **) KernelArgs); + + return StatusCode::SUCCESS; + } + + /// Get the default number of blocks and threads for the kernel. + int32_t getDefaultNumBlocks(GenericDeviceTy &) const override { return 1; } + int32_t getDefaultNumThreads(GenericDeviceTy &) const override { return 1; } + +private: + /// The kernel function to execute. + void (*Func)(void); +}; + +/// Class implementing the GenELF64 device images properties. +struct GenELF64DeviceImageTy : public DeviceImageTy { + /// Create the GenELF64 image with the id and the target image pointer. + GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), DynLib() {} + + /// Getter and setter for the dynamic library. + DynamicLibrary &getDynamicLibrary() { return DynLib; } + void setDynamicLibrary(const DynamicLibrary &Lib) { DynLib = Lib; } + +private: + /// The dynamic library that loaded the image. + DynamicLibrary DynLib; +}; + +/// Class implementing the device functionalities for GenELF64. +struct GenELF64DeviceTy : public GenericDeviceTy { + /// Create the device with a specific id. + GenELF64DeviceTy(int32_t DeviceId, int32_t NumDevices) + : GenericDeviceTy(DeviceId, NumDevices, GenELF64GridValues) {} + + ~GenELF64DeviceTy() {} + + /// Initialize the device, which is a no-op + StatusCode initImpl(GenericPluginTy &Plugin) override { + return StatusCode::SUCCESS; + } + + /// Deinitialize the device, which is a no-op + StatusCode deinitImpl() override { + return StatusCode::SUCCESS; + } + + /// Construct the kernel for a specific image on the device. + GenericKernelTy * + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + GlobalTy Function(KernelEntry); + + // Get the metadata (address) of the kernel function. + GenericGlobalHandlerTy &GlobalHandler = Plugin::get().getGlobalHandler(); + if (GlobalHandler.getGlobalMetadataFromDevice(*this, Image, Function)) + return nullptr; + + // Allocate and create the kernel. 
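launchImpl() above drives the host "kernel" through libffi: every kernel argument is described as a generic pointer and the entry point returns void. The same prepare-and-call pattern in a stand-alone form, with a fake kernel in place of the offload entry, is sketched below.

#include <ffi.h>
#include <cstdio>

static void fakeKernel(int *A, int *B) { *A += *B; }

int main() {
  int X = 40, Y = 2;
  int *PX = &X, *PY = &Y;

  // One ffi_type per argument; everything is a pointer, as in launchImpl.
  ffi_type *ArgTypes[] = {&ffi_type_pointer, &ffi_type_pointer};
  void *Args[] = {&PX, &PY};

  ffi_cif Cif;
  if (ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, 2, &ffi_type_void, ArgTypes) !=
      FFI_OK) {
    std::printf("ffi_prep_cif failed\n");
    return 1;
  }

  // ffi_call takes a void(*)() entry point; the plugin stores its kernel
  // function pointers the same way.
  ffi_call(&Cif, reinterpret_cast<void (*)()>(fakeKernel), nullptr, Args);

  std::printf("X = %d\n", X); // 42
  return 0;
}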
+ GenELF64KernelTy *GenELF64Kernel = Plugin::get().allocate(); + new (GenELF64Kernel) + GenELF64KernelTy(KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)()) Function.getPtr()); + + return GenELF64Kernel; + } + + /// Set the current context to this device, which is a no-op. + StatusCode setContext() override { + return StatusCode::SUCCESS; + } + + /// Load the binary image into the device and allocate an image object. + DeviceImageTy *loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) override { + // Allocate and initialize the image object. + GenELF64DeviceImageTy *Image = + Plugin::get().allocate(); + new (Image) + GenELF64DeviceImageTy(ImageId, TgtImage); + + // Create a temporary file. + char TmpFileName[] = "/tmp/tmpfile_XXXXXX"; + int TmpFileFd = mkstemp(TmpFileName); + if (TmpFileFd == -1) { + DP("Failed to create tmp file for loading target image\n"); + return nullptr; + } + + // Open the temporary file. + FILE *TmpFile = fdopen(TmpFileFd, "wb"); + if (!TmpFile) { + DP("Failed to open tmp file %s for loading target image\n", TmpFileName); + return nullptr; + } + + // Write the image into the temporary file. + size_t Written = fwrite(Image->getStart(), Image->getSize(), 1, TmpFile); + if (Written != 1) { + DP("Failed to write target image to tmp file %s\n", TmpFileName); + return nullptr; + } + + // Close the temporary file. + int Err = fclose(TmpFile); + if (Err) { + DP("Failed to close tmp file %s with the target image\n", TmpFileName); + return nullptr; + } + + // Load the temporary file as a dynamic library. + std::string ErrMsg; + DynamicLibrary DynLib = + DynamicLibrary::getPermanentLibrary(TmpFileName, &ErrMsg); + + // Check if the loaded library is valid. + if (!DynLib.isValid()) { + DP("Failed to load target image: %s\n", ErrMsg.c_str()); + return nullptr; + } + + // Save a reference of the image's dynamic library. + Image->setDynamicLibrary(DynLib); + + return Image; + } + + /// Allocate memory. Use std::malloc in all cases. + void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + if (Size == 0) + return nullptr; + + void *MemAlloc = nullptr; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + MemAlloc = std::malloc(Size); + break; + } + return MemAlloc; + } + + /// Free the memory. Use std::free in all cases. + int free(void *TgtPtr, TargetAllocTy Kind) override { + std::free(TgtPtr); + return OFFLOAD_SUCCESS; + } + + /// Submit data to the device (host to device transfer). + StatusCode dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + std::memcpy(TgtPtr, HstPtr, Size); + return StatusCode::SUCCESS; + } + + /// Retrieve data from the device (device to host transfer). + StatusCode dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + std::memcpy(HstPtr, TgtPtr, Size); + return StatusCode::SUCCESS; + } + + /// Exchange data between two devices within the plugin. This function is not + /// supported in this plugin. + StatusCode dataExchangeImpl(const void *SrcPtr, + GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // This function should never be called because the function + // GenELF64Plugin::isDataExchangable() returns false. + return StatusCode::FAIL; + } + + /// All functions are already synchronous. 
No need to do anything on this + /// synchronization function. + StatusCode synchronizeImpl(__tgt_async_info &AsyncInfo) override { + return StatusCode::SUCCESS; + } + + /// This plugin does not support interoperability + StatusCode initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return StatusCode::FAIL; + } + + /// This plugin does not support interoperability + StatusCode initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { + return StatusCode::FAIL; + } + + /// This plugin does not support the event API. Do nothing without failing. + StatusCode createEventImpl(void **EventPtrStorage) override { + *EventPtrStorage = nullptr; + return StatusCode::SUCCESS; + } + StatusCode destroyEventImpl(void *EventPtr) override { + return StatusCode::SUCCESS; + } + StatusCode recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return StatusCode::SUCCESS; + } + StatusCode waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return StatusCode::SUCCESS; + } + StatusCode syncEventImpl(void *EventPtr) override { + return StatusCode::SUCCESS; + } + + /// Print information about the device. + void printInfoImpl() override { + printf(" This is a generic-elf-64bit device\n"); + } + + /// This plugin should not setup the device environment. + virtual bool shouldSetupDeviceEnvironment() const override { return false; }; + + /// Getters and setters for stack size and heap size not relevant. + StatusCode getDeviceStackSize(uint64_t &Value) override { + Value = 0; + return StatusCode::SUCCESS; + } + StatusCode setDeviceStackSize(uint64_t Value) override { + return StatusCode::SUCCESS; + } + StatusCode getDeviceHeapSize(uint64_t &Value) override { + Value = 0; + return StatusCode::SUCCESS; + } + StatusCode setDeviceHeapSize(uint64_t Value) override { + return StatusCode::SUCCESS; + } + +private: + /// Grid values for Generic ELF64 plugins. + static constexpr GV GenELF64GridValues = { + 1, // GV_Slot_Size + 1, // GV_Warp_Size + 1, // GV_Max_Teams + 1, // GV_SimpleBufferSize + 1, // GV_Max_WG_Size + 1, // GV_Default_WG_Size + }; +}; + +class GenELF64GlobalHandler final : public GenericGlobalHandlerTy { +public: + StatusCode getGlobalMetadataFromDevice(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override { + const char *GlobalName = DeviceGlobal.getName().data(); + GenELF64DeviceImageTy &GenELF64Image = + static_cast(Image); + + // Get dynamic library that has loaded the device image. + DynamicLibrary &DynLib = GenELF64Image.getDynamicLibrary(); + + // Get the address of the symbol. + void *Addr = DynLib.getAddressOfSymbol(GlobalName); + if (Addr == nullptr) { + DP("Failed to load global '%s'\n", GlobalName); + return StatusCode::FAIL; + } + + // Save the pointer to the symbol. + DeviceGlobal.setPtr(Addr); + + return StatusCode::SUCCESS; + } +}; + +/// Class implementing the plugin functionalities for GenELF64. +struct GenELF64PluginTy final : public GenericPluginTy { + /// Create the plugin. + GenELF64PluginTy() : GenericPluginTy() { + // Initialize the generic plugin structure with multiple devices and a + // global handler. + GenericPluginTy::init(NUM_DEVICES, new GenELF64GlobalHandler()); + } + + /// This class should not be copied. + GenELF64PluginTy(const GenELF64PluginTy &) = delete; + GenELF64PluginTy(GenELF64PluginTy &&) = delete; + + ~GenELF64PluginTy() {} + + /// Get the ELF code to recognize the compatible binary images. 
+ uint16_t getMagicElfBits() const override { return TARGET_ELF_ID; } + + /// Create a GenELF64 device with a specific id. + GenELF64DeviceTy &createDevice(int32_t DeviceId) override { + GenELF64DeviceTy *Device = new GenELF64DeviceTy(DeviceId, getNumDevices()); + return *Device; + } + + /// This plugin does not support exchanging data between two devices. + bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { + return false; + } + + /// All images (ELF-compatible) should be compatible with this plugin. + bool isImageCompatible(int32_t DeviceId, + __tgt_image_info *Info) const override { + assert(isValidDeviceId(DeviceId) && "Invalid Generic ELF64 device"); + return true; + } +}; + +StatusCode Plugin::init() { + // Call the getter to intialize the GenELF64 plugin. + get(); + return StatusCode::SUCCESS; +} + +StatusCode Plugin::deinit() { + // The Generic ELF64 plugin should already be deinitialized at this point. + if (Plugin::isActive()) + return StatusCode::FAIL; + + return StatusCode::SUCCESS; +} + +GenericPluginTy &Plugin::get() { + static GenELF64PluginTy GenELF64Plugin; + assert(Plugin::isActive() && "Plugin is not active"); + return GenELF64Plugin; +} + +const char *Plugin::getErrorStr(const StatusCode &SC) { + return "No description"; +} + +} // plugin +} // target +} // omp +} // llvm Index: openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64le machine if available. 
+# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt =================================================================== --- /dev/null +++ openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a x86_64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") +else() + libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.") +endif() Index: openmp/libomptarget/src/rtl.cpp =================================================================== --- openmp/libomptarget/src/rtl.cpp +++ openmp/libomptarget/src/rtl.cpp @@ -15,6 +15,7 @@ #include "device.h" #include "private.h" #include "rtl.h" +#include "Utilities.h" #include #include @@ -24,16 +25,17 @@ using namespace llvm; using namespace llvm::sys; +using namespace llvm::omp::target; // List of all plugins that can support offloading. static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so", - /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", - /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", - /* Remote target */ "libomptarget.rtl.rpc.so", + /* PowerPC target */ "libomptarget.rtl.ppc64", + /* x86_64 target */ "libomptarget.rtl.x86_64", + /* CUDA target */ "libomptarget.rtl.cuda", + /* AArch64 target */ "libomptarget.rtl.aarch64", + /* SX-Aurora VE target */ "libomptarget.rtl.ve", + /* AMDGPU target */ "libomptarget.rtl.amdgpu", + /* Remote target */ "libomptarget.rtl.rpc", }; PluginManager *PM; @@ -86,152 +88,166 @@ DP("Loading RTLs...\n"); + BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", false); + // Attempt to open all the plugins and, if they exist, check if the interface // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - std::string ErrMsg; - auto DynLibrary = std::make_unique( - sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); - - if (!DynLibrary->isValid()) { - // Library does not exist or cannot be found. - DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str()); - continue; - } - - DP("Successfully loaded library '%s'!\n", Name); - + for (const char *Name : RTLNames) { AllRTLs.emplace_back(); - // Retrieve the RTL information from the runtime library. 
- RTLInfoTy &R = AllRTLs.back(); - - // Remove plugin on failure to call optional init_plugin - *((void **)&R.init_plugin) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin"); - if (R.init_plugin) { - int32_t Rc = R.init_plugin(); - if (Rc != OFFLOAD_SUCCESS) { - DP("Unable to initialize library '%s': %u!\n", Name, Rc); - AllRTLs.pop_back(); + RTLInfoTy &RTL = AllRTLs.back(); + + const std::string BaseRTLName(Name); + if (NextGenPlugins) { + if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL)) continue; - } - } - bool ValidPlugin = true; - - if (!(*((void **)&R.is_valid_binary) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary"))) - ValidPlugin = false; - if (!(*((void **)&R.number_of_devices) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) - ValidPlugin = false; - if (!(*((void **)&R.init_device) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) - ValidPlugin = false; - if (!(*((void **)&R.load_binary) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary"))) - ValidPlugin = false; - if (!(*((void **)&R.data_alloc) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc"))) - ValidPlugin = false; - if (!(*((void **)&R.data_submit) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit"))) - ValidPlugin = false; - if (!(*((void **)&R.data_retrieve) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve"))) - ValidPlugin = false; - if (!(*((void **)&R.data_delete) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete"))) - ValidPlugin = false; - if (!(*((void **)&R.run_region) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region"))) - ValidPlugin = false; - if (!(*((void **)&R.run_team_region) = DynLibrary->getAddressOfSymbol( - "__tgt_rtl_run_target_team_region"))) - ValidPlugin = false; - - // Invalid plugin - if (!ValidPlugin) { - DP("Invalid plugin as necessary interface is not found.\n"); - AllRTLs.pop_back(); - continue; + DP("Falling back to original plugin...\n"); } - // No devices are supported by this RTL? - if (!(R.NumberOfDevices = R.number_of_devices())) { - // The RTL is invalid! Will pop the object from the RTLs list. - DP("No devices supported in this RTL\n"); + if (!attemptLoadRTL(BaseRTLName + ".so", RTL)) AllRTLs.pop_back(); - continue; + } + + DP("RTLs loaded!\n"); +} + +bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) { + const char *Name = RTLName.c_str(); + + DP("Loading library '%s'...\n", Name); + + std::string ErrMsg; + auto DynLibrary = std::make_unique( + sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); + + if (!DynLibrary->isValid()) { + // Library does not exist or cannot be found. 
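The gating at the top of this hunk relies on the BoolEnvar helper declared elsewhere in this patch: when LIBOMPTARGET_NEXTGEN_PLUGINS evaluates to true, loadRTLs() first tries libomptarget.rtl.<name>.nextgen.so and only then falls back to the original libomptarget.rtl.<name>.so. A rough stand-in for that helper is sketched below; the accepted spellings are an assumption, not the real parser.

#include <cstdio>
#include <cstdlib>
#include <cstring>

class BoolEnvarSketch {
  bool Value;

public:
  BoolEnvarSketch(const char *Name, bool Default) : Value(Default) {
    // Treat a handful of common spellings as "true"; anything else keeps the
    // default. The real helper may accept more forms.
    if (const char *Env = std::getenv(Name))
      Value = !std::strcmp(Env, "1") || !std::strcmp(Env, "true") ||
              !std::strcmp(Env, "TRUE") || !std::strcmp(Env, "on");
  }
  operator bool() const { return Value; }
};

int main() {
  // e.g. run the application with LIBOMPTARGET_NEXTGEN_PLUGINS=1 set.
  BoolEnvarSketch NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", false);
  std::printf("nextgen plugins %s\n", NextGenPlugins ? "enabled" : "disabled");
  return 0;
}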
+ DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str()); + return false; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Remove plugin on failure to call optional init_plugin + *((void **)&RTL.init_plugin) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin"); + if (RTL.init_plugin) { + int32_t Rc = RTL.init_plugin(); + if (Rc != OFFLOAD_SUCCESS) { + DP("Unable to initialize library '%s': %u!\n", Name, Rc); + return false; } + } -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif + bool ValidPlugin = true; + + if (!(*((void **)&RTL.is_valid_binary) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary"))) + ValidPlugin = false; + if (!(*((void **)&RTL.number_of_devices) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) + ValidPlugin = false; + if (!(*((void **)&RTL.init_device) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) + ValidPlugin = false; + if (!(*((void **)&RTL.load_binary) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_alloc) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_submit) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_retrieve) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_delete) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete"))) + ValidPlugin = false; + if (!(*((void **)&RTL.run_region) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region"))) + ValidPlugin = false; + if (!(*((void **)&RTL.run_team_region) = DynLibrary->getAddressOfSymbol( + "__tgt_rtl_run_target_team_region"))) + ValidPlugin = false; + + // Invalid plugin + if (!ValidPlugin) { + DP("Invalid plugin as necessary interface is not found.\n"); + return false; + } - DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), - R.NumberOfDevices); - - // Optional functions - *((void **)&R.deinit_plugin) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin"); - *((void **)&R.is_valid_binary_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info"); - *((void **)&R.deinit_device) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device"); - *((void **)&R.init_requires) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires"); - *((void **)&R.data_submit_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async"); - *((void **)&R.data_retrieve_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async"); - *((void **)&R.run_region_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async"); - *((void **)&R.run_team_region_async) = DynLibrary->getAddressOfSymbol( - "__tgt_rtl_run_target_team_region_async"); - *((void **)&R.synchronize) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize"); - *((void **)&R.data_exchange) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange"); - *((void **)&R.data_exchange_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async"); - *((void **)&R.is_data_exchangable) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable"); - *((void **)&R.register_lib) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib"); - *((void **)&R.unregister_lib) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib"); - *((void **)&R.supports_empty_images) = - 
DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images"); - *((void **)&R.set_info_flag) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag"); - *((void **)&R.print_device_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info"); - *((void **)&R.create_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event"); - *((void **)&R.record_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event"); - *((void **)&R.wait_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event"); - *((void **)&R.sync_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event"); - *((void **)&R.destroy_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event"); - *((void **)&R.release_async_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info"); - *((void **)&R.init_async_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); - *((void **)&R.init_device_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); - - R.LibraryHandler = std::move(DynLibrary); + // No devices are supported by this RTL? + if (!(RTL.NumberOfDevices = RTL.number_of_devices())) { + // The RTL is invalid! Will pop the object from the RTLs list. + DP("No devices supported in this RTL\n"); + return false; } - DP("RTLs loaded!\n"); +#ifdef LIBOMPTARGET_DEBUG + RTL.RTLName = Name; +#endif - return; + DP("Registering RTL %s supporting %d devices!\n", Name, RTL.NumberOfDevices); + + // Optional functions + *((void **)&RTL.deinit_plugin) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin"); + *((void **)&RTL.is_valid_binary_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info"); + *((void **)&RTL.deinit_device) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device"); + *((void **)&RTL.init_requires) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires"); + *((void **)&RTL.data_submit_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async"); + *((void **)&RTL.data_retrieve_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async"); + *((void **)&RTL.run_region_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async"); + *((void **)&RTL.run_team_region_async) = DynLibrary->getAddressOfSymbol( + "__tgt_rtl_run_target_team_region_async"); + *((void **)&RTL.synchronize) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize"); + *((void **)&RTL.data_exchange) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange"); + *((void **)&RTL.data_exchange_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async"); + *((void **)&RTL.is_data_exchangable) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable"); + *((void **)&RTL.register_lib) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib"); + *((void **)&RTL.unregister_lib) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib"); + *((void **)&RTL.supports_empty_images) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images"); + *((void **)&RTL.set_info_flag) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag"); + *((void **)&RTL.print_device_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info"); + *((void **)&RTL.create_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event"); + *((void **)&RTL.record_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event"); + *((void **)&RTL.wait_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event"); + *((void **)&RTL.sync_event) = + 
DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event"); + *((void **)&RTL.destroy_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event"); + *((void **)&RTL.release_async_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info"); + *((void **)&RTL.init_async_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); + *((void **)&RTL.init_device_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + + RTL.LibraryHandler = std::move(DynLibrary); + + // Successfully loaded + return true; } //////////////////////////////////////////////////////////////////////////////// Index: openmp/libomptarget/test/lit.cfg =================================================================== --- openmp/libomptarget/test/lit.cfg +++ openmp/libomptarget/test/lit.cfg @@ -21,6 +21,10 @@ if 'LIBOMPTARGET_DEBUG' in os.environ: config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG'] +# Allow running the tests with nextgen plugins when available +if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ: + config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS'] + if 'OMP_TARGET_OFFLOAD' in os.environ: config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD']