diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h @@ -56,24 +56,24 @@ struct GV { /// The size reserved for data in a shared memory slot. - const unsigned GV_Slot_Size; + unsigned GV_Slot_Size; /// The default value of maximum number of threads in a worker warp. - const unsigned GV_Warp_Size; + unsigned GV_Warp_Size; constexpr unsigned warpSlotSize() const { return GV_Warp_Size * GV_Slot_Size; } /// the maximum number of teams. - const unsigned GV_Max_Teams; + unsigned GV_Max_Teams; // An alternative to the heavy data sharing infrastructure that uses global // memory is one that uses device __shared__ memory. The amount of such space // (in bytes) reserved by the OpenMP runtime is noted here. - const unsigned GV_SimpleBufferSize; + unsigned GV_SimpleBufferSize; // The absolute maximum team size for a working group - const unsigned GV_Max_WG_Size; + unsigned GV_Max_WG_Size; // The default maximum team size for a working group - const unsigned GV_Default_WG_Size; + unsigned GV_Default_WG_Size; constexpr unsigned maxWarpNumber() const { return GV_Max_WG_Size / GV_Warp_Size; diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -85,6 +85,7 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) +add_subdirectory(plugins-nextgen) add_subdirectory(DeviceRTL) add_subdirectory(tools) diff --git a/openmp/libomptarget/include/Utilities.h b/openmp/libomptarget/include/Utilities.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/include/Utilities.h @@ -0,0 +1,200 @@ +//===------- Utilities.h - Target independent OpenMP target RTL -- C++ ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Routines and classes used to provide useful functionalities like string +// parsing and environment variables. +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H +#define OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H + +#include "llvm/ADT/STLFunctionalExtras.h" + +#include "Debug.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace llvm { +namespace omp { +namespace target { + +/// Utility class for parsing strings to other types. +struct StringParser { + /// Parse a string to another type. + template static bool parse(const char *Value, Ty &Result); +}; + +/// Class for reading and checking environment variables. Currently working with +/// integer, floats, std::string and bool types. +template class Envar { + Ty Data; + bool IsPresent; + bool Initialized; + +public: + /// Auxiliary function to safely create envars. This static function safely + /// creates envars using fallible constructors. See the constructors to know + /// more details about the creation parameters. + template + static Expected create(ArgsTy &&...Args) { + Error Err = Error::success(); + Envar Envar(std::forward(Args)..., Err); + if (Err) + return std::move(Err); + return std::move(Envar); + } + + /// Create an empty envar. Cannot be consulted. This constructor is merely + /// for convenience. This constructor is not fallible. + Envar() : Data(Ty()), IsPresent(false), Initialized(false) {} + + /// Create an envar with a name and an optional default. The Envar object will + /// take the value read from the environment variable, or the default if it + /// was not set or not correct. This constructor is not fallible. + Envar(StringRef Name, Ty Default = Ty()) + : Data(Default), IsPresent(false), Initialized(true) { + + if (const char *EnvStr = getenv(Name.data())) { + // Check whether the envar is defined and valid. + IsPresent = StringParser::parse(EnvStr, Data); + + if (!IsPresent) { + DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data()); + Data = Default; + } + } + } + + /// Get the definitive value. + const Ty &get() const { + // Throw a runtime error in case this envar is not initialized. + if (!Initialized) + FATAL_MESSAGE0(1, "Consulting envar before initialization"); + + return Data; + } + + /// Get the definitive value. + operator Ty() const { return get(); } + + /// Indicate whether the environment variable was defined and valid. + bool isPresent() const { return IsPresent; } + +private: + /// This constructor should never fail but we provide it for convenience. This + /// way, the constructor can be used by the Envar::create() static function + /// to safely create this kind of envars. + Envar(StringRef Name, Ty Default, Error &Err) : Envar(Name, Default) { + ErrorAsOutParameter EAO(&Err); + Err = Error::success(); + } + + /// Create an envar with a name, getter function and a setter function. The + /// Envar object will take the value read from the environment variable if + /// this value is accepted by the setter function. Otherwise, the getter + /// function will be executed to get the default value. The getter should be + /// of the form Error GetterFunctionTy(Ty &Value) and the setter should + /// be of the form Error SetterFunctionTy(Ty Value). This constructor has a + /// private visibility because is a fallible constructor. Please use the + /// Envar::create() static function to safely create this object instead. + template + Envar(StringRef Name, GetterFunctor Getter, SetterFunctor Setter, Error &Err) + : Data(Ty()), IsPresent(false), Initialized(true) { + ErrorAsOutParameter EAO(&Err); + Err = init(Name, Getter, Setter); + } + + template + Error init(StringRef Name, GetterFunctor Getter, SetterFunctor Setter); +}; + +/// Define some common envar types. +using IntEnvar = Envar; +using Int32Envar = Envar; +using Int64Envar = Envar; +using UInt32Envar = Envar; +using UInt64Envar = Envar; +using StringEnvar = Envar; +using BoolEnvar = Envar; + +template <> +inline bool StringParser::parse(const char *ValueStr, bool &Result) { + std::string Value(ValueStr); + + // Convert the string to lowercase. + std::transform(Value.begin(), Value.end(), Value.begin(), + [](unsigned char c) { return std::tolower(c); }); + + // May be implemented with fancier C++ features, but let's keep it simple. + if (Value == "true" || Value == "yes" || Value == "on" || Value == "1") + Result = true; + else if (Value == "false" || Value == "no" || Value == "off" || Value == "0") + Result = false; + else + return false; + + // Parsed correctly. + return true; +} + +template +inline bool StringParser::parse(const char *Value, Ty &Result) { + assert(Value && "Parsed value cannot be null"); + + std::istringstream Stream(Value); + Stream >> Result; + + return !Stream.fail(); +} + +template +template +inline Error Envar::init(StringRef Name, GetterFunctor Getter, + SetterFunctor Setter) { + // Get the default value. + Ty Default; + if (Error Err = Getter(Default)) + return Err; + + if (const char *EnvStr = getenv(Name.data())) { + IsPresent = StringParser::parse(EnvStr, Data); + if (IsPresent) { + // Check whether the envar value is actually valid. + Error Err = Setter(Data); + if (Err) { + // The setter reported an invalid value. Mark the user-defined value as + // not present and reset to the getter value (default). + IsPresent = false; + Data = Default; + DP("Setter of envar %s failed, resetting to %s\n", Name.data(), + std::to_string(Data).data()); + consumeError(std::move(Err)); + } + } else { + DP("Ignoring invalid value %s for envar %s\n", EnvStr, Name.data()); + Data = Default; + } + } else { + Data = Default; + } + + return Error::success(); +} + +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_INCLUDE_UTILITIES_H diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -169,6 +169,9 @@ // (i.e. the library attempts to load the RTLs (plugins) only once). std::once_flag InitFlag; void loadRTLs(); // not thread-safe + +private: + static bool attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL); }; /// Map between the host entry begin and the translation table. Each diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt @@ -0,0 +1,87 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +add_subdirectory(common) + +# void build_generic_elf64_nextgen(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); +# - build a plugin for an ELF based generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. +# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +macro(build_generic_elf64_nextgen tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} NextGen offloading plugin.") + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define debug prefix. TODO: This should be automatized in the Debug.h but + # it requires changing the original plugins. + add_definitions(-DDEBUG_PREFIX="TARGET ${tmachine_name} RTL") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_llvm_library("omptarget.rtl.${tmachine_libname}.nextgen" + SHARED + + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + + NO_INSTALL_RPATH + ) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}.nextgen" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + set_target_properties("omptarget.rtl.${tmachine_libname}.nextgen" PROPERTIES + INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." + CXX_VISIBILITY_PRESET protected) + + target_include_directories( "omptarget.rtl.${tmachine_libname}.nextgen" PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} + ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + + list(APPEND LIBOMPTARGET_TESTED_PLUGINS + "omptarget.rtl.${tmachine_libname}.nextgen") + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) +else() + libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(aarch64) +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.") +endif() diff --git a/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/CMakeLists.txt @@ -0,0 +1,13 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +add_subdirectory(PluginInterface) diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt @@ -0,0 +1,32 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Common parts which can be used by all plugins +# +##===----------------------------------------------------------------------===## + +# Plugin Interface library. +add_library(PluginInterface OBJECT PluginInterface.cpp GlobalHandler.cpp) + +# Define the TARGET_NAME. +add_definitions("-DTARGET_NAME=PluginInterface") + +# Define the DEBUG_PREFIX. +add_definitions(-DDEBUG_PREFIX="PluginInterface") + +set_property(TARGET PluginInterface PROPERTY POSITION_INDEPENDENT_CODE ON) +llvm_update_compile_flags(PluginInterface) +set(LINK_LLVM_LIBS LLVMSupport) +if (LLVM_LINK_LLVM_DYLIB) + set(LINK_LLVM_LIBS LLVM) +endif() +target_link_libraries(PluginInterface INTERFACE ${LINK_LLVM_LIBS} PRIVATE elf_common MemoryManager) +add_dependencies(PluginInterface ${LINK_LLVM_LIBS}) + +target_include_directories(PluginInterface INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(PluginInterface PRIVATE ${LIBOMPTARGET_INCLUDE_DIR}) diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.h @@ -0,0 +1,180 @@ +//===- GlobalHandler.h - Target independent global & enviroment handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H +#define LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H + +#include + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Object/ELFObjectFile.h" + +#include "Debug.h" +#include "Utilities.h" +#include "omptarget.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +struct DeviceImageTy; +struct GenericDeviceTy; + +using namespace llvm::object; + +/// Common abstraction for globals that live on the host and device. +/// It simply encapsulates the symbol name, symbol size, and symbol address +/// (which might be host or device depending on the context). +class GlobalTy { + // NOTE: Maybe we can have a pointer to the offload entry name instead of + // holding a private copy of the name as a std::string. + std::string Name; + uint32_t Size; + void *Ptr; + +public: + GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr) + : Name(Name), Size(Size), Ptr(Ptr) {} + + GlobalTy(const __tgt_offload_entry &Entry) + : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {} + + const std::string &getName() const { return Name; } + uint32_t getSize() const { return Size; } + void *getPtr() const { return Ptr; } + + void setSize(int32_t S) { Size = S; } + void setPtr(void *P) { Ptr = P; } +}; + +/// Subclass of GlobalTy that holds the memory for a global of \p Ty. +template class StaticGlobalTy : public GlobalTy { + Ty Data; + +public: + template + StaticGlobalTy(const std::string &Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + + template + StaticGlobalTy(const char *Name, Args &&...args) + : GlobalTy(Name, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + + template + StaticGlobalTy(const char *Name, const char *Suffix, Args &&...args) + : GlobalTy(std::string(Name) + Suffix, sizeof(Ty), &Data), + Data(Ty{std::forward(args)...}) {} + + Ty &getValue() { return Data; } + const Ty &getValue() const { return Data; } + void setValue(const Ty &V) { Data = V; } +}; + +/// Helper class to do the heavy lifting when it comes to moving globals between +/// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD, +/// which means the only things specialized by the subclass is the retrival of +/// global metadata (size, addr) from the device. +/// \see getGlobalMetadataFromDevice +class GenericGlobalHandlerTy { + /// Map to store the ELF object files that have been loaded. + llvm::DenseMap ELFObjectFiles; + + /// Get the cached ELF64LEObjectFile previosuly created for a specific + /// device image or create it if did not exist. + const ELF64LEObjectFile * + getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image); + + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + bool Device2Host); + + /// Actually move memory between host and device. See readGlobalFromDevice and + /// writeGlobalToDevice for the interface description. + Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal, + bool Device2Host); + +public: + virtual ~GenericGlobalHandlerTy() {} + + /// Get the address and size of a global in the image. Address and size are + /// return in \p ImageGlobal, the global name is passed in \p ImageGlobal. + Error getGlobalMetadataFromImage(GenericDeviceTy &Device, + DeviceImageTy &Image, GlobalTy &ImageGlobal); + + /// Read the memory associated with a global from the image and store it on + /// the host. The name, size, and destination are defined by \p HostGlobal. + Error readGlobalFromImage(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal); + + /// Get the address and size of a global from the device. Address is return in + /// \p DeviceGlobal, the global name and expected size are passed in + /// \p DeviceGlobal. + virtual Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) = 0; + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. The origin is defined by \p DeviceGlobal. + Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, + DeviceGlobal, + /* D2H */ true); + } + + /// Copy the memory associated with a global from the device to its + /// counterpart on the host. The name, size, and destination are defined by + /// \p HostGlobal. The origin is automatically resolved. + Error readGlobalFromDevice(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, + /* D2H */ true); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is defined by \p DeviceGlobal. + Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal, + const GlobalTy &DeviceGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, + DeviceGlobal, + /* D2H */ false); + } + + /// Copy the memory associated with a global from the host to its counterpart + /// on the device. The name, size, and origin are defined by \p HostGlobal. + /// The destination is automatically resolved. + Error writeGlobalToDevice(GenericDeviceTy &Device, DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, + /* D2H */ false); + } +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_GLOBALHANDLER_H diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/GlobalHandler.cpp @@ -0,0 +1,152 @@ +//===- GlobalHandler.cpp - Target independent global & env. var handling --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Target independent global handler and environment manager. +// +//===----------------------------------------------------------------------===// + +#include "GlobalHandler.h" +#include "ELFSymbols.h" +#include "PluginInterface.h" + +#include + +using namespace llvm; +using namespace omp; +using namespace target; +using namespace plugin; + +const ELF64LEObjectFile * +GenericGlobalHandlerTy::getOrCreateELFObjectFile(const GenericDeviceTy &Device, + DeviceImageTy &Image) { + + auto Search = ELFObjectFiles.find(Image.getId()); + if (Search != ELFObjectFiles.end()) + // The ELF object file was already there. + return &Search->second; + + // The ELF object file we are checking is not created yet. + Expected ElfOrErr = + ELF64LEObjectFile::create(Image.getMemoryBuffer()); + if (!ElfOrErr) { + consumeError(ElfOrErr.takeError()); + return nullptr; + } + + auto Result = + ELFObjectFiles.try_emplace(Image.getId(), std::move(ElfOrErr.get())); + assert(Result.second && "Map insertion failed"); + assert(Result.first != ELFObjectFiles.end() && "Map insertion failed"); + + return &Result.first->second; +} + +Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( + GenericDeviceTy &Device, DeviceImageTy &Image, const GlobalTy &HostGlobal, + bool Device2Host) { + + GlobalTy DeviceGlobal(HostGlobal.getName(), HostGlobal.getSize()); + + // Get the metadata from the global on the device. + if (auto Err = getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) + return Err; + + // Perform the actual transfer. + return moveGlobalBetweenDeviceAndHost(Device, Image, HostGlobal, DeviceGlobal, + Device2Host); +} + +/// Actually move memory between host and device. See readGlobalFromDevice and +/// writeGlobalToDevice for the interface description. +Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( + GenericDeviceTy &Device, DeviceImageTy &DeviceImage, + const GlobalTy &HostGlobal, const GlobalTy &DeviceGlobal, + bool Device2Host) { + + // Transfer the data from the source to the destination. + if (Device2Host) { + if (auto Err = + Device.dataRetrieve(HostGlobal.getPtr(), DeviceGlobal.getPtr(), + HostGlobal.getSize(), nullptr)) + return Err; + } else { + if (auto Err = Device.dataSubmit(DeviceGlobal.getPtr(), HostGlobal.getPtr(), + HostGlobal.getSize(), nullptr)) + return Err; + } + + DP("Succesfully %s %u bytes associated with global symbol '%s' %s the device " + "(%p -> %p).\n", + Device2Host ? "read" : "write", HostGlobal.getSize(), + HostGlobal.getName().data(), Device2Host ? "from" : "to", + DeviceGlobal.getPtr(), HostGlobal.getPtr()); + + return Plugin::success(); +} + +Error GenericGlobalHandlerTy::getGlobalMetadataFromImage( + GenericDeviceTy &Device, DeviceImageTy &Image, GlobalTy &ImageGlobal) { + + // Get the ELF object file for the image. Notice the ELF object may already + // be created in previous calls, so we can reuse it. + const ELF64LEObjectFile *ELFObj = getOrCreateELFObjectFile(Device, Image); + if (!ELFObj) + return Plugin::error("Unable to create ELF object for image %p", + Image.getStart()); + + // Search the ELF symbol using the the symbol name. + auto SymOrErr = getELFSymbol(*ELFObj, ImageGlobal.getName()); + if (!SymOrErr) + return Plugin::error("Failed ELF lookup of global '%s': %s", + ImageGlobal.getName().data(), + toString(SymOrErr.takeError()).data()); + + if (!*SymOrErr) + return Plugin::error("Failed to find global symbol '%s' in the ELF image", + ImageGlobal.getName().data()); + + // Get the section to which the symbol belongs. + auto SymSecOrErr = ELFObj->getELFFile().getSection((*SymOrErr)->st_shndx); + if (!SymSecOrErr) + return Plugin::error("Failed to get ELF section from global '%s': %s", + ImageGlobal.getName().data(), + toString(SymOrErr.takeError()).data()); + + // Save the global symbol's address and size. The address of the global is the + // image base address + the section offset + the symbol value. + ImageGlobal.setPtr((char *)Image.getStart() + (*SymSecOrErr)->sh_offset + + (*SymOrErr)->st_value); + ImageGlobal.setSize((*SymOrErr)->st_size); + + return Plugin::success(); +} + +Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, + DeviceImageTy &Image, + const GlobalTy &HostGlobal) { + + GlobalTy ImageGlobal(HostGlobal.getName(), -1); + if (auto Err = getGlobalMetadataFromImage(Device, Image, ImageGlobal)) + return Err; + + if (ImageGlobal.getSize() != HostGlobal.getSize()) + return Plugin::error("Transfer failed because global symbol '%s' has " + "%u bytes in the ELF image but %u bytes on the host", + HostGlobal.getName().data(), ImageGlobal.getSize(), + HostGlobal.getSize()); + + DP("Global symbol '%s' was found in the ELF image and %u bytes will copied " + "from %p to %p.\n", + HostGlobal.getName().data(), HostGlobal.getSize(), ImageGlobal.getPtr(), + HostGlobal.getPtr()); + + // Perform the copy from the image to the host memory. + std::memcpy(HostGlobal.getPtr(), ImageGlobal.getPtr(), HostGlobal.getSize()); + + return Plugin::success(); +} diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h @@ -0,0 +1,824 @@ +//===- PluginInterface.h - Target independent plugin device interface -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H +#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_PLUGININTERFACE_H + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "MemoryManager.h" +#include "Utilities.h" +#include "omptarget.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBufferRef.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +struct GenericPluginTy; +struct GenericKernelTy; +struct GenericDeviceTy; + +/// Class that wraps the __tgt_async_info to simply its usage. In case the +/// object is constructed without a valid __tgt_async_info, the object will use +/// an internal one and will synchronize the current thread with the pending +/// operations on object destruction. +struct AsyncInfoWrapperTy { + AsyncInfoWrapperTy(Error &Err, GenericDeviceTy &Device, + __tgt_async_info *AsyncInfoPtr) + : Err(Err), ErrOutParam(&Err), Device(Device), + AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {} + + /// Synchronize with the __tgt_async_info's pending operations if it's the + /// internal one. + ~AsyncInfoWrapperTy(); + + /// Get the raw __tgt_async_info pointer. + operator __tgt_async_info *() const { return AsyncInfoPtr; } + + /// Get a reference to the underlying plugin-specific queue type. + template Ty &getQueueAs() const { + static_assert(sizeof(Ty) == sizeof(AsyncInfoPtr->Queue), + "Queue is not of the same size as target type"); + return reinterpret_cast(AsyncInfoPtr->Queue); + } + +private: + Error &Err; + ErrorAsOutParameter ErrOutParam; + GenericDeviceTy &Device; + __tgt_async_info LocalAsyncInfo; + __tgt_async_info *const AsyncInfoPtr; +}; + +/// Class wrapping a __tgt_device_image and its offload entry table on a +/// specific device. This class is responsible for storing and managing +/// the offload entries for an image on a device. +class DeviceImageTy { + + /// Class representing the offload entry table. The class stores the + /// __tgt_target_table and a map to search in the table faster. + struct OffloadEntryTableTy { + /// Add new entry to the table. + void addEntry(const __tgt_offload_entry &Entry) { + Entries.push_back(Entry); + TTTablePtr.EntriesBegin = &Entries[0]; + TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size(); + } + + /// Get the raw pointer to the __tgt_target_table. + operator __tgt_target_table *() { + if (Entries.empty()) + return nullptr; + return &TTTablePtr; + } + + private: + __tgt_target_table TTTablePtr; + llvm::SmallVector<__tgt_offload_entry> Entries; + }; + + /// Image identifier within the corresponding device. Notice that this id is + /// not unique between different device; they may overlap. + int32_t ImageId; + + /// The pointer to the raw __tgt_device_image. + const __tgt_device_image *TgtImage; + + /// Table of offload entries. + OffloadEntryTableTy OffloadEntryTable; + +public: + DeviceImageTy(int32_t Id, const __tgt_device_image *Image) + : ImageId(Id), TgtImage(Image) { + assert(TgtImage && "Invalid target image"); + } + + /// Get the image identifier within the device. + int32_t getId() const { return ImageId; } + + /// Get the pointer to the raw __tgt_device_image. + const __tgt_device_image *getTgtImage() const { return TgtImage; } + + /// Get the image starting address. + void *getStart() const { return TgtImage->ImageStart; } + + /// Get the image size. + size_t getSize() const { + return ((char *)TgtImage->ImageEnd) - ((char *)TgtImage->ImageStart); + } + + /// Get a memory buffer reference to the whole image. + MemoryBufferRef getMemoryBuffer() const { + return MemoryBufferRef(StringRef((const char *)getStart(), getSize()), + "Image"); + } + + /// Get a reference to the offload entry table for the image. + OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; } +}; + +/// Class implementing common functionalities of offload kernels. Each plugin +/// should define the specific kernel class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericKernelTy { + /// Construct a kernel with a name and a execution mode. + GenericKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode) + : Name(Name), ExecutionMode(ExecutionMode), DynamicMemorySize(0), + PreferredNumThreads(0), MaxNumThreads(0) {} + + virtual ~GenericKernelTy() {} + + /// Initialize the kernel object from a specific device. + Error init(GenericDeviceTy &GenericDevice, DeviceImageTy &Image); + virtual Error initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) = 0; + + /// Launch the kernel on the specific device. The device must be the same + /// one used to initialize the kernel. + Error launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, uint64_t NumTeamsClause, + uint32_t ThreadLimitClause, uint64_t LoopTripCount, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, + uint64_t NumBlocks, uint32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0; + + /// Get the kernel name. + const char *getName() const { return Name; } + + /// Indicate whether an execution mode is valid. + static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + case OMP_TGT_EXEC_MODE_GENERIC: + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return true; + } + return false; + } + +private: + /// Prepare the arguments before launching the kernel. + void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const; + + /// Get the default number of threads and blocks for the kernel. + virtual uint32_t getDefaultNumThreads(GenericDeviceTy &Device) const = 0; + virtual uint64_t getDefaultNumBlocks(GenericDeviceTy &Device) const = 0; + + /// Get the number of threads and blocks for the kernel based on the + /// user-defined threads and block clauses. + uint32_t getNumThreads(GenericDeviceTy &GenericDevice, + uint32_t ThreadLimitClause) const; + uint64_t getNumBlocks(GenericDeviceTy &GenericDevice, + uint64_t BlockLimitClause, uint64_t LoopTripCount, + uint32_t NumThreads) const; + + /// Indicate if the kernel works in Generic SPMD, Generic or SPMD mode. + bool isGenericSPMDMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC_SPMD; + } + bool isGenericMode() const { + return ExecutionMode == OMP_TGT_EXEC_MODE_GENERIC; + } + bool isSPMDMode() const { return ExecutionMode == OMP_TGT_EXEC_MODE_SPMD; } + + /// Get the execution mode name of the kernel. + const char *getExecutionModeName() const { + switch (ExecutionMode) { + case OMP_TGT_EXEC_MODE_SPMD: + return "SPMD"; + case OMP_TGT_EXEC_MODE_GENERIC: + return "Generic"; + case OMP_TGT_EXEC_MODE_GENERIC_SPMD: + return "Generic-SPMD"; + } + llvm_unreachable("Unknown execution mode!"); + } + + /// The kernel name. + const char *Name; + + /// The execution flags of the kernel. + OMPTgtExecModeFlags ExecutionMode; + +protected: + /// The dynamic memory size reserved for executing the kernel. + uint32_t DynamicMemorySize; + + /// The preferred number of threads to run the kernel. + uint32_t PreferredNumThreads; + + /// The maximum number of threads which the kernel could leverage. + uint32_t MaxNumThreads; +}; + +/// Class implementing common functionalities of offload devices. Each plugin +/// should define the specific device class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericDeviceTy : public DeviceAllocatorTy { + /// Construct a device with its device id within the plugin, the number of + /// devices in the plugin and the grid values for that kind of device. + GenericDeviceTy(int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &GridValues); + + /// Get the device identifier within the corresponding plugin. Notice that + /// this id is not unique between different plugins; they may overlap. + int32_t getDeviceId() const { return DeviceId; } + + /// Set the context of the device if needed, before calling device-specific + /// functions. Plugins may implement this function as a no-op if not needed. + virtual Error setContext() = 0; + + /// Initialize the device. After this call, the device should be already + /// working and ready to accept queries or modifications. + Error init(GenericPluginTy &Plugin); + virtual Error initImpl(GenericPluginTy &Plugin) = 0; + + /// Deinitialize the device and free all its resources. After this call, the + /// device is no longer considered ready, so no queries or modifications are + /// allowed. + Error deinit(); + virtual Error deinitImpl() = 0; + + /// Load the binary image into the device and return the target table. + Expected<__tgt_target_table *> loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage); + virtual Expected + loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0; + + /// Setup the device environment if needed. Notice this setup may not be run + /// on some plugins. By default, it will be executed, but plugins can change + /// this behavior by overriding the shouldSetupDeviceEnvironment function. + Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image); + + /// Register the offload entries for a specific image on the device. + Error registerOffloadEntries(DeviceImageTy &Image); + + /// Synchronize the current thread with the pending operations on the + /// __tgt_async_info structure. + Error synchronize(__tgt_async_info *AsyncInfo); + virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0; + + /// Allocate data on the device or involving the device. + Expected dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind); + + /// Deallocate data from the device or involving the device. + Error dataDelete(void *TgtPtr, TargetAllocTy Kind); + + /// Submit data to the device (host to device transfer). + Error dataSubmit(void *TgtPtr, const void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Retrieve data from the device (device to host transfer). + Error dataRetrieve(void *HstPtr, const void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfo); + virtual Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Exchange data between devices (device to device transfer). Calling this + /// function is only valid if GenericPlugin::isDataExchangable() passing the + /// two devices returns true. + Error dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, void *DstPtr, + int64_t Size, __tgt_async_info *AsyncInfo); + virtual Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Run the target region with multiple teams. + Error runTargetTeamRegion(void *EntryPtr, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + uint64_t NumTeamsClause, uint32_t ThreadLimitClause, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo); + + /// Initialize a __tgt_async_info structure. Related to interop features. + Error initAsyncInfo(__tgt_async_info **AsyncInfoPtr); + virtual Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Initialize a __tgt_device_info structure. Related to interop features. + Error initDeviceInfo(__tgt_device_info *DeviceInfo); + virtual Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) = 0; + + /// Create an event. + Error createEvent(void **EventPtrStorage); + virtual Error createEventImpl(void **EventPtrStorage) = 0; + + /// Destroy an event. + Error destroyEvent(void *Event); + virtual Error destroyEventImpl(void *EventPtr) = 0; + + /// Start the recording of the event. + Error recordEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Wait for an event to finish. Notice this wait is asynchronous if the + /// __tgt_async_info is not nullptr. + Error waitEvent(void *Event, __tgt_async_info *AsyncInfo); + virtual Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; + + /// Synchronize the current thread with the event. + Error syncEvent(void *EventPtr); + virtual Error syncEventImpl(void *EventPtr) = 0; + + /// Print information about the device. + Error printInfo(); + virtual Error printInfoImpl() = 0; + + /// Getters of the grid values. + uint32_t getWarpSize() const { return GridValues.GV_Warp_Size; } + uint32_t getThreadLimit() const { return GridValues.GV_Max_WG_Size; } + uint64_t getBlockLimit() const { return GridValues.GV_Max_Teams; } + uint32_t getDefaultNumThreads() const { + return GridValues.GV_Default_WG_Size; + } + uint64_t getDefaultNumBlocks() const { + // TODO: Introduce a default num blocks value. + return GridValues.GV_Default_WG_Size; + } + uint32_t getDynamicMemorySize() const { return OMPX_SharedMemorySize; } + +private: + /// Register offload entry for global variable. + Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, + const __tgt_offload_entry &GlobalEntry, + __tgt_offload_entry &DeviceEntry); + + /// Register offload entry for kernel function. + Error registerKernelOffloadEntry(DeviceImageTy &DeviceImage, + const __tgt_offload_entry &KernelEntry, + __tgt_offload_entry &DeviceEntry); + + /// Allocate and construct a kernel object. + virtual Expected + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) = 0; + + /// Get and set the stack size and heap size for the device. If not used, the + /// plugin can implement the setters as no-op and setting the output + /// value to zero for the getters. + virtual Error getDeviceStackSize(uint64_t &V) = 0; + virtual Error setDeviceStackSize(uint64_t V) = 0; + virtual Error getDeviceHeapSize(uint64_t &V) = 0; + virtual Error setDeviceHeapSize(uint64_t V) = 0; + + /// Indicate whether the device should setup the device environment. Notice + /// that returning false in this function will change the behavior of the + /// setupDeviceEnvironment() function. + virtual bool shouldSetupDeviceEnvironment() const { return true; } + + /// Environment variables defined by the OpenMP standard. + Int32Envar OMP_TeamLimit; + Int32Envar OMP_NumTeams; + Int32Envar OMP_TeamsThreadLimit; + + /// Environment variables defined by the LLVM OpenMP implementation. + Int32Envar OMPX_DebugKind; + UInt32Envar OMPX_SharedMemorySize; + UInt64Envar OMPX_TargetStackSize; + UInt64Envar OMPX_TargetHeapSize; + + /// Pointer to the memory manager or nullptr if not available. + MemoryManagerTy *MemoryManager; + +protected: + /// Array of images loaded into the device. Images are automatically + /// deallocated by the allocator. + llvm::SmallVector LoadedImages; + + /// The identifier of the device within the plugin. Notice this is not a + /// global device id and is not the device id visible to the OpenMP user. + const int32_t DeviceId; + + /// The default grid values used for this device. + llvm::omp::GV GridValues; + + /// Enumeration used for representing the current state between two devices + /// two devices (both under the same plugin) for the peer access between them. + /// The states can be a) PENDING when the state has not been queried and needs + /// to be queried, b) AVAILABLE when the peer access is available to be used, + /// and c) UNAVAILABLE if the system does not allow it. + enum class PeerAccessState : uint8_t { AVAILABLE, UNAVAILABLE, PENDING }; + + /// Array of peer access states with the rest of devices. This means that if + /// the device I has a matrix PeerAccesses with PeerAccesses[J] == AVAILABLE, + /// the device I can access device J's memory directly. However, notice this + /// does not mean that device J can access device I's memory directly. + llvm::SmallVector PeerAccesses; + std::mutex PeerAccessesLock; +}; + +/// Class implementing common functionalities of offload plugins. Each plugin +/// should define the specific plugin class, derive from this generic one, and +/// implement the necessary virtual function members. +struct GenericPluginTy { + + /// Construct a plugin instance. The number of active instances should be + /// always be either zero or one. + GenericPluginTy() : RequiresFlags(OMP_REQ_UNDEFINED), GlobalHandler(nullptr) { + ++NumActiveInstances; + } + + /// Destroy the plugin instance and release all its resources. Also decrease + /// the number of instances. + virtual ~GenericPluginTy() { + // There is no global handler if no device is available. + if (GlobalHandler) + delete GlobalHandler; + + // Deinitialize all active devices. + for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) { + if (Devices[DeviceId]) { + if (auto Err = deinitDevice(DeviceId)) + REPORT("Failure to deinitialize device %d: %s\n", DeviceId, + toString(std::move(Err)).data()); + } + assert(!Devices[DeviceId] && "Device was not deinitialized"); + } + + --NumActiveInstances; + } + + /// Get the reference to the device with a certain device id. + GenericDeviceTy &getDevice(int32_t DeviceId) { + assert(isValidDeviceId(DeviceId) && "Invalid device id"); + assert(Devices[DeviceId] && "Device is unitialized"); + + return *Devices[DeviceId]; + } + + /// Get the number of active devices. + int32_t getNumDevices() const { return NumDevices; } + + /// Get the ELF code to recognize the binary image of this plugin. + virtual uint16_t getMagicElfBits() const = 0; + + /// Allocate a structure using the internal allocator. + template Ty *allocate() { + return reinterpret_cast(Allocator.Allocate(sizeof(Ty), alignof(Ty))); + } + + /// Get the reference to the global handler of this plugin. + GenericGlobalHandlerTy &getGlobalHandler() { + assert(GlobalHandler && "Global handler not initialized"); + return *GlobalHandler; + } + + /// Get the OpenMP requires flags set for this plugin. + int64_t getRequiresFlags() const { return RequiresFlags; } + + /// Set the OpenMP requires flags for this plugin. + void setRequiresFlag(int64_t Flags) { RequiresFlags = Flags; } + + /// Initialize a device within the plugin. + Error initDevice(int32_t DeviceId); + + /// Deinitialize a device within the plugin and release its resources. + Error deinitDevice(int32_t DeviceId); + + /// Indicate whether data can be exchanged directly between two devices under + /// this same plugin. If this function returns true, it's safe to call the + /// GenericDeviceTy::exchangeData() function on the source device. + virtual bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) { + return isValidDeviceId(SrcDeviceId) && isValidDeviceId(DstDeviceId); + } + + /// Indicate if an image is compatible with the plugin devices. Notice that + /// this function may be called before actually initializing the devices. So + /// we could not move this function into GenericDeviceTy. + virtual Expected isImageCompatible(__tgt_image_info *Info) const = 0; + + /// Indicate whether the plugin supports empty images. + virtual bool supportsEmptyImages() const { return false; } + + /// Indicate whether there is any active plugin instance. + static bool hasAnyActiveInstance() { + assert(NumActiveInstances <= 1 && "Invalid number of instances"); + return (NumActiveInstances > 0); + } + +protected: + /// Initialize the plugin and prepare for initializing its devices. + void init(int NumDevices, GenericGlobalHandlerTy *GlobalHandler) { + this->NumDevices = NumDevices; + this->GlobalHandler = GlobalHandler; + + assert(Devices.size() == 0 && "Plugin already intialized"); + + Devices.resize(NumDevices, nullptr); + } + + /// Create a new device with a specific device id. + virtual GenericDeviceTy &createDevice(int32_t DeviceId) = 0; + + /// Indicate whether a device id is valid. + bool isValidDeviceId(int32_t DeviceId) const { + return (DeviceId >= 0 && DeviceId < getNumDevices()); + } + +private: + /// Number of devices available for the plugin. + int32_t NumDevices; + + /// Array of pointers to the devices. Initially, they are all set to nullptr. + /// Once a device is initialized, the pointer is stored in the position given + /// by its device id. A position with nullptr means that the corresponding + /// device was not initialized yet. + llvm::SmallVector Devices; + + /// OpenMP requires flags. + int64_t RequiresFlags; + + /// Pointer to the global handler for this plugin. + GenericGlobalHandlerTy *GlobalHandler; + + /// Internal allocator for different structures. + BumpPtrAllocator Allocator; + + /// Indicates the number of active plugin instances. Actually, we should only + /// have one active instance per plugin library. But we use a counter for + /// simplicity. + static uint32_t NumActiveInstances; +}; + +/// Class for simplifying the getter operation of the plugin. Anywhere on the +/// code, the current plugin can be retrieved by Plugin::get(). The init(), +/// deinit(), get() and check() functions should be defined by each plugin +/// implementation. +class Plugin { + /// Avoid instances of this class. + Plugin() {} + Plugin(const Plugin &) = delete; + void operator=(const Plugin &) = delete; + +public: + /// Initialize the plugin if it was not initialized yet. + static Error init(); + + /// Deinitialize the plugin if it was not deinitialized yet. + static Error deinit(); + + /// Get a reference (or create if it was not created) to the plugin instance. + static GenericPluginTy &get(); + + /// Get a reference to the plugin with a specific plugin-specific type. + template static Ty &get() { return static_cast(get()); } + + /// Indicate if the plugin is currently active. Actually, we check if there is + /// any active instances. + static bool isActive() { return GenericPluginTy::hasAnyActiveInstance(); } + + /// Create a success error. + static Error success() { return Error::success(); } + + /// Create a string error. + template + static Error error(const char *ErrFmt, ArgsTy... Args) { + return createStringError(inconvertibleErrorCode(), ErrFmt, Args...); + } + + /// Check the plugin-specific error code and return an error or success + /// accordingly. In case of an error, create a string error with the error + /// description. The ErrFmt should follow the format: + /// "Error in []: %s" + /// The last format specifier "%s" is mandatory and will be used to place the + /// error code's description. Notice this function should be only called from + /// the plugin-specific code. + template + static Error check(int32_t ErrorCode, const char *ErrFmt, ArgsTy... Args); +}; + +/// Auxiliary interface class for GenericDeviceResourcePoolTy. This class acts +/// as a reference to a device resource, such as a stream, and requires some +/// basic functions to be implemented. The derived class should define an empty +/// constructor that creates an empty and invalid resource reference. Do not +/// create a new resource on the ctor, but on the create() function instead. +struct GenericDeviceResourceRef { + /// Create a new resource and stores a reference. + virtual Error create() = 0; + + /// Destroy and release the resources pointed by the reference. + virtual Error destroy() = 0; +}; + +/// Class that implements a resource pool belonging to a device. This class +/// operates with references to the actual resources. These reference must +/// derive from the GenericDeviceResourceRef class and implement the create +/// and destroy virtual functions. +template class GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create an empty resource pool for a specific device. + GenericDeviceResourcePoolTy(GenericDeviceTy &Device) + : Device(Device), NextAvailable(0) {} + + /// Destroy the resource pool. At this point, the deinit() function should + /// already have been executed so the resource pool should be empty. + virtual ~GenericDeviceResourcePoolTy() { + assert(ResourcePool.empty() && "Resource pool not empty"); + } + + /// Initialize the resource pool. + Error init(uint32_t InitialSize) { + assert(ResourcePool.empty() && "Resource pool already initialized"); + return ResourcePoolTy::resizeResourcePool(InitialSize); + } + + /// Deinitialize the resource pool and delete all resources. This function + /// must be called before the destructor. + Error deinit() { + if (NextAvailable) + DP("Missing %d resources to be returned\n", NextAvailable); + + // TODO: This prevents a bug on libomptarget to make the plugins fail. There + // may be some resources not returned. Do not destroy these ones. + if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable)) + return Err; + + ResourcePool.clear(); + + return Plugin::success(); + } + +protected: + /// Get resource from the pool or create new resources. + ResourceRef getResource() { + const std::lock_guard Lock(Mutex); + if (NextAvailable == ResourcePool.size()) { + // By default we double the resource pool every time. + if (auto Err = ResourcePoolTy::resizeResourcePool(NextAvailable * 2)) { + REPORT("Failure to resize the resource pool: %s", + toString(std::move(Err)).data()); + // Return an empty reference. + return ResourceRef(); + } + } + return ResourcePool[NextAvailable++]; + } + + /// Return resource to the pool. + void returnResource(ResourceRef Resource) { + const std::lock_guard Lock(Mutex); + ResourcePool[--NextAvailable] = Resource; + } + +private: + /// The resources between \p OldSize and \p NewSize need to be created or + /// destroyed. The mutex is locked when this function is called. + Error resizeResourcePoolImpl(uint32_t OldSize, uint32_t NewSize) { + assert(OldSize != NewSize && "Resizing to the same size"); + + if (auto Err = Device.setContext()) + return Err; + + if (OldSize < NewSize) { + // Create new resources. + for (uint32_t I = OldSize; I < NewSize; ++I) { + if (auto Err = ResourcePool[I].create()) + return Err; + } + } else { + // Destroy the obsolete resources. + for (uint32_t I = NewSize; I < OldSize; ++I) { + if (auto Err = ResourcePool[I].destroy()) + return Err; + } + } + return Plugin::success(); + } + + /// Increase or decrease the number of resources. This function should + /// be called with the mutex acquired. + Error resizeResourcePool(uint32_t NewSize) { + uint32_t OldSize = ResourcePool.size(); + + // Nothing to do. + if (OldSize == NewSize) + return Plugin::success(); + + if (OldSize > NewSize) { + // Decrease the number of resources. + auto Err = ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + ResourcePool.resize(NewSize); + return Err; + } + + // Increase the number of resources otherwise. + ResourcePool.resize(NewSize); + return ResourcePoolTy::resizeResourcePoolImpl(OldSize, NewSize); + } + + /// The device to which the resources belong + GenericDeviceTy &Device; + + /// Mutex for the resource pool. + std::mutex Mutex; + + /// The next available resource in the pool. + uint32_t NextAvailable; + +protected: + /// The actual resource pool. + std::deque ResourcePool; +}; + +/// Class implementing a common stream manager. This class can be directly used +/// by the specific plugins if necessary. The StreamRef type should derive from +/// the GenericDeviceResourceRef. Look at its description to know the details of +/// their requirements. +template +class GenericStreamManagerTy : public GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create a stream manager with space for an initial number of streams. No + /// stream will be created until the init() function is called. + GenericStreamManagerTy(GenericDeviceTy &Device, uint32_t DefNumStreams = 32) + : ResourcePoolTy(Device), + InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", DefNumStreams) {} + + /// Initialize the stream pool and their resources with the initial number of + /// streams. + Error init() { return ResourcePoolTy::init(InitialNumStreams.get()); } + + /// Get an available stream or create new. + StreamRef getStream() { return ResourcePoolTy::getResource(); } + + /// Return idle stream. + void returnStream(StreamRef Stream) { + ResourcePoolTy::returnResource(Stream); + } + +private: + /// The initial stream pool size, potentially defined by an envar. + UInt32Envar InitialNumStreams; +}; + +/// Class implementing a common event manager. This class can be directly used +/// by the specific plugins if necessary. The EventRef type should derive from +/// the GenericDeviceResourceRef. Look at its description to know the details of +/// their requirements. +template +struct GenericEventManagerTy : public GenericDeviceResourcePoolTy { + using ResourcePoolTy = GenericDeviceResourcePoolTy; + +public: + /// Create an event manager with space for an initial number of events. No + /// event will be created until the init() function is called. + GenericEventManagerTy(GenericDeviceTy &Device, uint32_t DefNumEvents = 32) + : ResourcePoolTy(Device), + InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", DefNumEvents) {} + + /// Initialize the event pool and their resources with the initial number of + /// events. + Error init() { return ResourcePoolTy::init(InitialNumEvents.get()); } + + /// Get an available event or create new. + EventRef getEvent() { return ResourcePoolTy::getResource(); } + + /// Return an idle event. + void returnEvent(EventRef Event) { ResourcePoolTy::returnResource(Event); } + +private: + /// The initial event pool size, potentially defined by an envar. + UInt32Envar InitialNumEvents; +}; + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm + +#endif // OPENMP_LIBOMPTARGET_PLUGINS_COMMON_PLUGININTERFACE_H diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp @@ -0,0 +1,849 @@ +//===- PluginInterface.cpp - Target independent plugin device interface ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "PluginInterface.h" +#include "Debug.h" +#include "GlobalHandler.h" +#include "elf_common.h" +#include "omptarget.h" +#include "omptargetplugin.h" + +#include +#include + +using namespace llvm; +using namespace omp; +using namespace target; +using namespace plugin; + +uint32_t GenericPluginTy::NumActiveInstances = 0; + +AsyncInfoWrapperTy::~AsyncInfoWrapperTy() { + // If we used a local async info object we want synchronous behavior. + // In that case, and assuming the current status code is OK, we will + // synchronize explicitly when the object is deleted. + if (AsyncInfoPtr == &LocalAsyncInfo && !Err) + Err = Device.synchronize(&LocalAsyncInfo); +} + +Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) { + PreferredNumThreads = getDefaultNumThreads(GenericDevice); + if (isGenericMode()) + PreferredNumThreads += GenericDevice.getWarpSize(); + + MaxNumThreads = GenericDevice.getThreadLimit(); + + DynamicMemorySize = GenericDevice.getDynamicMemorySize(); + + return initImpl(GenericDevice, Image); +} + +Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, + ptrdiff_t *ArgOffsets, int32_t NumArgs, + uint64_t NumTeamsClause, + uint32_t ThreadLimitClause, + uint64_t LoopTripCount, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + llvm::SmallVector Args; + llvm::SmallVector Ptrs; + + void *KernelArgsPtr = prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, NumArgs, + Args, Ptrs, AsyncInfoWrapper); + + uint32_t NumThreads = getNumThreads(GenericDevice, ThreadLimitClause); + uint64_t NumBlocks = + getNumBlocks(GenericDevice, NumTeamsClause, LoopTripCount, NumThreads); + + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), + "Launching kernel %s with %d blocks and %d threads in %s mode\n", + getName(), NumBlocks, NumThreads, getExecutionModeName()); + + return launchImpl(GenericDevice, NumThreads, NumBlocks, DynamicMemorySize, + NumArgs, KernelArgsPtr, AsyncInfoWrapper); +} + +void *GenericKernelTy::prepareArgs(GenericDeviceTy &GenericDevice, + void **ArgPtrs, ptrdiff_t *ArgOffsets, + int32_t NumArgs, + llvm::SmallVectorImpl &Args, + llvm::SmallVectorImpl &Ptrs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + Args.resize(NumArgs); + Ptrs.resize(NumArgs); + + if (NumArgs == 0) + return nullptr; + + for (int I = 0; I < NumArgs; ++I) { + Ptrs[I] = (void *)((intptr_t)ArgPtrs[I] + ArgOffsets[I]); + Args[I] = &Ptrs[I]; + } + return &Args[0]; +} + +uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice, + uint32_t ThreadLimitClause) const { + return std::min(MaxNumThreads, (ThreadLimitClause > 0) ? ThreadLimitClause + : PreferredNumThreads); +} + +uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice, + uint64_t NumTeamsClause, + uint64_t LoopTripCount, + uint32_t NumThreads) const { + uint64_t PreferredNumBlocks = getDefaultNumBlocks(GenericDevice); + if (NumTeamsClause > 0) { + PreferredNumBlocks = NumTeamsClause; + } else if (LoopTripCount > 0) { + if (isSPMDMode()) { + // We have a combined construct, i.e. `target teams distribute + // parallel for [simd]`. We launch so many teams so that each thread + // will execute one iteration of the loop. round up to the nearest + // integer + PreferredNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1; + } else { + assert((isGenericMode() || isGenericSPMDMode()) && + "Unexpected execution mode!"); + // If we reach this point, then we have a non-combined construct, i.e. + // `teams distribute` with a nested `parallel for` and each team is + // assigned one iteration of the `distribute` loop. E.g.: + // + // #pragma omp target teams distribute + // for(...loop_tripcount...) { + // #pragma omp parallel for + // for(...) {} + // } + // + // Threads within a team will execute the iterations of the `parallel` + // loop. + PreferredNumBlocks = LoopTripCount; + } + } + return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit()); +} + +GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices, + const llvm::omp::GV &OMPGridValues) + : OMP_TeamLimit("OMP_TEAM_LIMIT"), OMP_NumTeams("OMP_NUM_TEAMS"), + OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"), + OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"), + OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"), + // Do not initialize the following two envars since they depend on the + // device initialization. These cannot be consulted until the device is + // initialized correctly. We intialize them in GenericDeviceTy::init(). + OMPX_TargetStackSize(), OMPX_TargetHeapSize(), MemoryManager(nullptr), + DeviceId(DeviceId), GridValues(OMPGridValues), + PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock() { + if (OMP_NumTeams > 0) + GridValues.GV_Max_Teams = + std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams)); + + if (OMP_TeamsThreadLimit > 0) + GridValues.GV_Max_WG_Size = + std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit)); +}; + +Error GenericDeviceTy::init(GenericPluginTy &Plugin) { + if (auto Err = initImpl(Plugin)) + return Err; + + // Read and reinitialize the envars that depend on the device initialization. + // Notice these two envars may change the stack size and heap size of the + // device, so they need the device properly initialized. + auto StackSizeEnvarOrErr = UInt64Envar::create( + "LIBOMPTARGET_STACK_SIZE", + [this](uint64_t &V) -> Error { return getDeviceStackSize(V); }, + [this](uint64_t V) -> Error { return setDeviceStackSize(V); }); + if (!StackSizeEnvarOrErr) + return StackSizeEnvarOrErr.takeError(); + OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr); + + auto HeapSizeEnvarOrErr = UInt64Envar::create( + "LIBOMPTARGET_HEAP_SIZE", + [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); }, + [this](uint64_t V) -> Error { return setDeviceHeapSize(V); }); + if (!HeapSizeEnvarOrErr) + return HeapSizeEnvarOrErr.takeError(); + OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr); + + // Enable the memory manager if required. + auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv(); + if (EnableMM) + MemoryManager = new MemoryManagerTy(*this, ThresholdMM); + + return Plugin::success(); +} + +Error GenericDeviceTy::deinit() { + // Delete the memory manager before deinitilizing the device. Otherwise, + // we may delete device allocations after the device is deinitialized. + if (MemoryManager) + delete MemoryManager; + MemoryManager = nullptr; + + return deinitImpl(); +} + +Expected<__tgt_target_table *> +GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, + const __tgt_device_image *TgtImage) { + DP("Load data from image " DPxMOD "\n", DPxPTR(TgtImage->ImageStart)); + + // Load the binary and allocate the image object. Use the next available id + // for the image id, which is the number of previously loaded images. + auto ImageOrErr = loadBinaryImpl(TgtImage, LoadedImages.size()); + if (!ImageOrErr) + return ImageOrErr.takeError(); + + DeviceImageTy *Image = *ImageOrErr; + assert(Image != nullptr && "Invalid image"); + + // Add the image to list. + LoadedImages.push_back(Image); + + // Setup the device environment if needed. + if (auto Err = setupDeviceEnvironment(Plugin, *Image)) + return std::move(Err); + + // Register all offload entries of the image. + if (auto Err = registerOffloadEntries(*Image)) + return std::move(Err); + + // Return the pointer to the table of entries. + return Image->getOffloadEntryTable(); +} + +Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin, + DeviceImageTy &Image) { + // There are some plugins that do not need this step. + if (!shouldSetupDeviceEnvironment()) + return Plugin::success(); + + DeviceEnvironmentTy DeviceEnvironment; + DeviceEnvironment.DebugKind = OMPX_DebugKind; + DeviceEnvironment.NumDevices = Plugin.getNumDevices(); + // TODO: The device ID used here is not the real device ID used by OpenMP. + DeviceEnvironment.DeviceNum = DeviceId; + DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize; + + // Create the metainfo of the device environment global. + GlobalTy DeviceEnvGlobal("omptarget_device_environment", + sizeof(DeviceEnvironmentTy), &DeviceEnvironment); + + // Write device environment values to the device. + GenericGlobalHandlerTy &GlobalHandler = Plugin.getGlobalHandler(); + return GlobalHandler.writeGlobalToDevice(*this, Image, DeviceEnvGlobal); +} + +Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) { + const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin; + const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd; + for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) { + // The host should have always something in the address to uniquely + // identify the entry. + if (!Entry->addr) + return Plugin::error("Failure to register entry without address"); + + __tgt_offload_entry DeviceEntry = {0}; + + if (Entry->size) { + if (auto Err = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry)) + return Err; + } else { + if (auto Err = registerKernelOffloadEntry(Image, *Entry, DeviceEntry)) + return Err; + } + + assert(DeviceEntry.addr && "Device addr of offload entry cannot be null"); + + DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n", + DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name, + DPxPTR(DeviceEntry.addr)); + } + return Plugin::success(); +} + +Error GenericDeviceTy::registerGlobalOffloadEntry( + DeviceImageTy &Image, const __tgt_offload_entry &GlobalEntry, + __tgt_offload_entry &DeviceEntry) { + + GenericPluginTy &Plugin = Plugin::get(); + + DeviceEntry = GlobalEntry; + + // Create a metadata object for the device global. + GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size); + + // Get the address of the device of the global. + GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); + if (auto Err = + GHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal)) + return Err; + + // Store the device address on the device entry. + DeviceEntry.addr = DeviceGlobal.getPtr(); + assert(DeviceEntry.addr && "Invalid device global's address"); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. + if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If unified memory is present any target link or to variables + // can access host addresses directly. There is no longer a + // need for device copies. + GlobalTy HostGlobal(GlobalEntry); + if (auto Err = GHandler.writeGlobalToDevice(*this, Image, HostGlobal, + DeviceGlobal)) + return Err; + } + + // Add the device entry on the entry table. + Image.getOffloadEntryTable().addEntry(DeviceEntry); + + return Plugin::success(); +} + +Error GenericDeviceTy::registerKernelOffloadEntry( + DeviceImageTy &Image, const __tgt_offload_entry &KernelEntry, + __tgt_offload_entry &DeviceEntry) { + DeviceEntry = KernelEntry; + + // Create a kernel object. + auto KernelOrErr = constructKernelEntry(KernelEntry, Image); + if (!KernelOrErr) + return KernelOrErr.takeError(); + + GenericKernelTy *Kernel = *KernelOrErr; + assert(Kernel != nullptr && "Invalid kernel"); + + // Initialize the kernel. + if (auto Err = Kernel->init(*this, Image)) + return Err; + + // Set the device entry address to the kernel address and store the entry on + // the entry table. + DeviceEntry.addr = (void *)Kernel; + Image.getOffloadEntryTable().addEntry(DeviceEntry); + + return Plugin::success(); +} + +Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) { + if (!AsyncInfo || !AsyncInfo->Queue) + return Plugin::error("Invalid async info queue"); + + return synchronizeImpl(*AsyncInfo); +} + +Expected GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, + TargetAllocTy Kind) { + void *Alloc = nullptr; + + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + if (MemoryManager) { + Alloc = MemoryManager->allocate(Size, HostPtr); + if (!Alloc) + return Plugin::error("Failed to allocate from memory manager"); + break; + } + [[fallthrough]]; + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + Alloc = allocate(Size, HostPtr, Kind); + if (!Alloc) + return Plugin::error("Failed to allocate from device allocator"); + } + + // Sucessful and valid allocation. + if (Alloc) + return Alloc; + + // At this point means that we did not tried to allocate from the memory + // manager nor the device allocator. + return Plugin::error("Invalid target data allocation kind or requested " + "allocator not implemented yet"); +} + +Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { + int Res; + if (MemoryManager) + Res = MemoryManager->free(TgtPtr); + else + Res = free(TgtPtr, Kind); + + if (Res) + return Plugin::error("Failure to deallocate device pointer %p", TgtPtr); + + return Plugin::success(); +} + +Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr, + int64_t Size, __tgt_async_info *AsyncInfo) { + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr, + int64_t Size, __tgt_async_info *AsyncInfo) { + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, + void *DstPtr, int64_t Size, + __tgt_async_info *AsyncInfo) { + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::runTargetTeamRegion( + void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, int32_t NumArgs, + uint64_t NumTeamsClause, uint32_t ThreadLimitClause, uint64_t LoopTripCount, + __tgt_async_info *AsyncInfo) { + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + + GenericKernelTy &GenericKernel = + *reinterpret_cast(EntryPtr); + + Err = + GenericKernel.launch(*this, ArgPtrs, ArgOffsets, NumArgs, NumTeamsClause, + ThreadLimitClause, LoopTripCount, AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) { + assert(AsyncInfoPtr && "Invalid async info"); + + *AsyncInfoPtr = new __tgt_async_info(); + + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, *AsyncInfoPtr); + Err = initAsyncInfoImpl(AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) { + assert(DeviceInfo && "Invalid device info"); + + return initDeviceInfoImpl(DeviceInfo); +} + +Error GenericPluginTy::initDevice(int32_t DeviceId) { + assert(!Devices[DeviceId] && "Device already initialized"); + + // Create the device and save the reference. + GenericDeviceTy &Device = createDevice(DeviceId); + Devices[DeviceId] = &Device; + + // Initialize the device and its resources. + return Device.init(*this); +} + +Error GenericPluginTy::deinitDevice(int32_t DeviceId) { + // The device may be already deinitialized. + if (Devices[DeviceId] == nullptr) + return Plugin::success(); + + // Deinitialize the device and release its resources. + if (auto Err = Devices[DeviceId]->deinit()) + return Err; + + // Delete the device and invalidate its reference. + delete Devices[DeviceId]; + Devices[DeviceId] = nullptr; + + return Plugin::success(); +} + +Error GenericDeviceTy::printInfo() { + // TODO: Print generic information here + return printInfoImpl(); +} + +Error GenericDeviceTy::createEvent(void **EventPtrStorage) { + return createEventImpl(EventPtrStorage); +} + +Error GenericDeviceTy::destroyEvent(void *EventPtr) { + return destroyEventImpl(EventPtr); +} + +Error GenericDeviceTy::recordEvent(void *EventPtr, + __tgt_async_info *AsyncInfo) { + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + Err = recordEventImpl(EventPtr, AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) { + auto Err = Plugin::success(); + AsyncInfoWrapperTy AsyncInfoWrapper(Err, *this, AsyncInfo); + Err = waitEventImpl(EventPtr, AsyncInfoWrapper); + return Err; +} + +Error GenericDeviceTy::syncEvent(void *EventPtr) { + return syncEventImpl(EventPtr); +} + +/// Exposed library API function, basically wrappers around the GenericDeviceTy +/// functionality with the same name. All non-async functions are redirected +/// to the async versions right away with a NULL AsyncInfoPtr. +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_init_plugin() { + auto Err = Plugin::init(); + if (Err) + REPORT("Failure to initialize plugin " GETNAME(TARGET_NAME) ": %s\n", + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_deinit_plugin() { + auto Err = Plugin::deinit(); + if (Err) + REPORT("Failure to deinitialize plugin " GETNAME(TARGET_NAME) ": %s\n", + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) { + if (!Plugin::isActive()) + return false; + + return elf_check_machine(TgtImage, Plugin::get().getMagicElfBits()); +} + +int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage, + __tgt_image_info *Info) { + if (!Plugin::isActive()) + return false; + + if (!__tgt_rtl_is_valid_binary(TgtImage)) + return false; + + // A subarchitecture was not specified. Assume it is compatible. + if (!Info->Arch) + return true; + + // Check the compatibility with all the available devices. Notice the + // devices may not be initialized yet. + auto CompatibleOrErr = Plugin::get().isImageCompatible(Info); + if (!CompatibleOrErr) { + // This error should not abort the execution, so we just inform the user + // through the debug system. + std::string ErrString = toString(CompatibleOrErr.takeError()); + DP("Failure to check whether image %p is valid: %s\n", TgtImage, + ErrString.data()); + return false; + } + + bool Compatible = *CompatibleOrErr; + DP("Image is %scompatible with current environment: %s\n", + (Compatible) ? "" : "not", Info->Arch); + + return Compatible; +} + +int32_t __tgt_rtl_supports_empty_images() { + return Plugin::get().supportsEmptyImages(); +} + +int32_t __tgt_rtl_init_device(int32_t DeviceId) { + auto Err = Plugin::get().initDevice(DeviceId); + if (Err) + REPORT("Failure to initialize device %d: %s\n", DeviceId, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_deinit_device(int32_t DeviceId) { + auto Err = Plugin::get().deinitDevice(DeviceId); + if (Err) + REPORT("Failure to deinitialize device %d: %s\n", DeviceId, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_number_of_devices() { return Plugin::get().getNumDevices(); } + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + Plugin::get().setRequiresFlag(RequiresFlags); + return RequiresFlags; +} + +int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId, + int32_t DstDeviceId) { + return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId); +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, + __tgt_device_image *TgtImage) { + GenericPluginTy &Plugin = Plugin::get(); + auto TableOrErr = Plugin.getDevice(DeviceId).loadBinary(Plugin, TgtImage); + if (!TableOrErr) { + auto Err = TableOrErr.takeError(); + REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage, + DeviceId, toString(std::move(Err)).data()); + return nullptr; + } + + __tgt_target_table *Table = *TableOrErr; + assert(Table != nullptr && "Invalid table"); + + return Table; +} + +void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, + int32_t Kind) { + auto AllocOrErr = Plugin::get().getDevice(DeviceId).dataAlloc( + Size, HostPtr, (TargetAllocTy)Kind); + if (!AllocOrErr) { + auto Err = AllocOrErr.takeError(); + REPORT("Failure to allocate device memory: %s\n", + toString(std::move(Err)).data()); + return nullptr; + } + assert(*AllocOrErr && "Null pointer upon successful allocation"); + + return *AllocOrErr; +} + +int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) { + auto Err = + Plugin::get().getDevice(DeviceId).dataDelete(TgtPtr, (TargetAllocTy)Kind); + if (Err) + REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr, + int64_t Size) { + return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr, + void *HstPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + auto Err = Plugin::get().getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size, + AsyncInfoPtr); + if (Err) + REPORT("Failure to copy data from host to device. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, + int64_t Size) { + return __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr, + void *TgtPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr) { + auto Err = Plugin::get().getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, + Size, AsyncInfoPtr); + if (Err) + REPORT("Faliure to copy data from device to host. Pointers: host " + "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr, + int32_t DstDeviceId, void *DstPtr, + int64_t Size) { + return __tgt_rtl_data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr, + Size, /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, + int DstDeviceId, void *DstPtr, + int64_t Size, + __tgt_async_info *AsyncInfo) { + GenericDeviceTy &SrcDevice = Plugin::get().getDevice(SrcDeviceId); + GenericDeviceTy &DstDevice = Plugin::get().getDevice(DstDeviceId); + auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo); + if (Err) + REPORT("Failure to copy data from device (%d) to device (%d). Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n", + SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, + int32_t ThreadLimit, + uint64_t LoopTripCount) { + return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs, + TgtOffsets, NumArgs, NumTeams, + ThreadLimit, LoopTripCount, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr) { + auto Err = Plugin::get().getDevice(DeviceId).runTargetTeamRegion( + TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit, + LoopTripCount, AsyncInfoPtr); + if (Err) + REPORT("Failure to run target region " DPxMOD " in device %d: %s\n", + DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_synchronize(int32_t DeviceId, + __tgt_async_info *AsyncInfoPtr) { + auto Err = Plugin::get().getDevice(DeviceId).synchronize(AsyncInfoPtr); + if (Err) + REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs) { + return __tgt_rtl_run_target_region_async(DeviceId, TgtEntryPtr, TgtArgs, + TgtOffsets, NumArgs, + /* AsyncInfoPtr */ nullptr); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr) { + return __tgt_rtl_run_target_team_region_async( + DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, NumArgs, + /* team num*/ 1, /* thread limit */ 1, /* loop tripcount */ 0, + AsyncInfoPtr); +} + +void __tgt_rtl_print_device_info(int32_t DeviceId) { + if (auto Err = Plugin::get().getDevice(DeviceId).printInfo()) + REPORT("Failure to print device %d info: %s\n", DeviceId, + toString(std::move(Err)).data()); +} + +int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) { + auto Err = Plugin::get().getDevice(DeviceId).createEvent(EventPtr); + if (Err) + REPORT("Failure to create event: %s\n", toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr) { + auto Err = + Plugin::get().getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr); + if (Err) + REPORT("Failure to record event %p: %s\n", EventPtr, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr, + __tgt_async_info *AsyncInfoPtr) { + auto Err = + Plugin::get().getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr); + if (Err) + REPORT("Failure to wait event %p: %s\n", EventPtr, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) { + auto Err = Plugin::get().getDevice(DeviceId).syncEvent(EventPtr); + if (Err) + REPORT("Failure to synchronize event %p: %s\n", EventPtr, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) { + auto Err = Plugin::get().getDevice(DeviceId).destroyEvent(EventPtr); + if (Err) + REPORT("Failure to destroy event %p: %s\n", EventPtr, + toString(std::move(Err)).data()); + + return (bool)Err; +} + +void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) { + std::atomic &InfoLevel = getInfoLevelInternal(); + InfoLevel.store(NewInfoLevel); +} + +int32_t __tgt_rtl_init_async_info(int32_t DeviceId, + __tgt_async_info **AsyncInfoPtr) { + assert(AsyncInfoPtr && "Invalid async info"); + + auto Err = Plugin::get().getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr); + if (Err) + REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n", + DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data()); + + return (bool)Err; +} + +int32_t __tgt_rtl_init_device_info(int32_t DeviceId, + __tgt_device_info *DeviceInfo, + const char **ErrStr) { + *ErrStr = ""; + + auto Err = Plugin::get().getDevice(DeviceId).initDeviceInfo(DeviceInfo); + if (Err) + REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n", + DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data()); + + return (bool)Err; +} + +#ifdef __cplusplus +} +#endif diff --git a/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/cuda/CMakeLists.txt @@ -0,0 +1,98 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. +# +##===----------------------------------------------------------------------===## +set(LIBOMPTARGET_BUILD_CUDA_PLUGIN TRUE CACHE BOOL + "Whether to build CUDA plugin") +if (NOT LIBOMPTARGET_BUILD_CUDA_PLUGIN) + libomptarget_say("Not building CUDA NextGen offloading plugin: LIBOMPTARGET_BUILD_CUDA_PLUGIN is false") + return() +endif() + +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +endif() + +libomptarget_say("Building CUDA NextGen offloading plugin.") + +# Define the suffix for the runtime messaging dumps. +add_definitions("-DTARGET_NAME=CUDA") + +# Define debug prefix. TODO: This should be automatized in the Debug.h but it +# requires changing the original plugins. +add_definitions(-DDEBUG_PREFIX="TARGET CUDA RTL") + +set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF) +option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA}) + +set(LIBOMPTARGET_CAN_LINK_LIBCUDA FALSE) +if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) + set(LIBOMPTARGET_CAN_LINK_LIBCUDA TRUE) +endif() + +if (LIBOMPTARGET_CAN_LINK_LIBCUDA AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA) + libomptarget_say("Building CUDA NextGen plugin linked against libcuda") + include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) + add_llvm_library(omptarget.rtl.cuda.nextgen SHARED + + src/rtl.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + + NO_INSTALL_RPATH + ) +else() + libomptarget_say("Building CUDA NextGen plugin for dlopened libcuda") + include_directories(../../plugins/cuda/dynamic_cuda) + add_llvm_library(omptarget.rtl.cuda.nextgen + SHARED + + src/rtl.cpp + ../../plugins/cuda/dynamic_cuda/cuda.cpp + + ADDITIONAL_HEADER_DIRS + ${LIBOMPTARGET_INCLUDE_DIR} + + LINK_LIBS + PRIVATE + elf_common + MemoryManager + PluginInterface + ${CMAKE_DL_LIBS} + ${OPENMP_PTHREAD_LIB} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports" + "-Wl,-z,defs" + + NO_INSTALL_RPATH + ) +endif() +add_dependencies(omptarget.rtl.cuda.nextgen omptarget.devicertl.nvptx) + +# Install plugin under the lib destination folder. +install(TARGETS omptarget.rtl.cuda.nextgen LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") +set_target_properties(omptarget.rtl.cuda.nextgen PROPERTIES + INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.." + CXX_VISIBILITY_PRESET protected) + +target_include_directories(omptarget.rtl.cuda.nextgen PRIVATE + ${LIBOMPTARGET_INCLUDE_DIR} +) diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -0,0 +1,1051 @@ +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL NextGen for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include "llvm/BinaryFormat/ELF.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Forward declarations for all specialized data structures. +struct CUDAKernelTy; +struct CUDADeviceTy; +struct CUDAPluginTy; +struct CUDAStreamManagerTy; +struct CUDAEventManagerTy; + +/// Class implementing the CUDA kernel functionalities which derives from the +/// generic kernel class. +struct CUDAKernelTy : public GenericKernelTy { + /// Create a CUDA kernel with a name, an execution mode, and the kernel + /// function. + CUDAKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, + CUfunction Func) + : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + + /// Initialize the CUDA kernel + Error initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) override { + int MaxThreads; + CUresult Res = cuFuncGetAttribute( + &MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func); + if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s")) + return Err; + + /// Set the maximum number of threads for the CUDA kernel. + MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads); + + return Plugin::success(); + } + + /// Launch the CUDA kernel function + Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, + uint64_t NumBlocks, uint32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override; + + /// The default number of blocks is common to the whole device. + uint64_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumBlocks(); + } + + /// The default number of threads is common to the whole device. + uint32_t getDefaultNumThreads(GenericDeviceTy &GenericDevice) const override { + return GenericDevice.getDefaultNumThreads(); + } + +private: + /// The CUDA kernel function to execute. + CUfunction Func; +}; + +/// Class wrapping a CUDA stream reference. These are the objects handled by the +/// Stream Manager for the CUDA plugin. +class CUDAStreamRef final : public GenericDeviceResourceRef { + /// The reference to the CUDA stream. + CUstream Stream; + +public: + /// Create an empty reference to an invalid stream. + CUDAStreamRef() : Stream(nullptr) {} + + /// Create a reference to an existing stream. + CUDAStreamRef(CUstream Stream) : Stream(Stream) {} + + /// Create a new stream and save the reference. The reference must be empty + /// before calling to this function. + Error create() override { + if (Stream) + return Plugin::error("Creating an existing stream"); + + CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING); + if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s")) + return Err; + + return Plugin::success(); + } + + /// Destroy the referenced stream and invalidate the reference. The reference + /// must be to a valid stream before calling to this function. + Error destroy() override { + if (!Stream) + return Plugin::error("Destroying an invalid stream"); + + CUresult Res = cuStreamDestroy(Stream); + if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s")) + return Err; + + Stream = nullptr; + return Plugin::success(); + } + + /// Get the underlying CUstream. + operator CUstream() const { return Stream; } +}; + +/// Class wrapping a CUDA event reference. These are the objects handled by the +/// Event Manager for the CUDA plugin. +class CUDAEventRef final : public GenericDeviceResourceRef { + CUevent Event; + +public: + /// Create an empty reference to an invalid event. + CUDAEventRef() : Event(nullptr) {} + + /// Create a reference to an existing event. + CUDAEventRef(CUevent Event) : Event(Event) {} + + /// Create a new event and save the reference. The reference must be empty + /// before calling to this function. + Error create() override { + if (Event) + return Plugin::error("Creating an existing event"); + + CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT); + if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s")) + return Err; + + return Plugin::success(); + } + + /// Destroy the referenced event and invalidate the reference. The reference + /// must be to a valid event before calling to this function. + Error destroy() override { + if (!Event) + return Plugin::error("Destroying an invalid event"); + + CUresult Res = cuEventDestroy(Event); + if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s")) + return Err; + + Event = nullptr; + return Plugin::success(); + } + + /// Get the underlying CUevent. + operator CUevent() const { return Event; } +}; + +/// Class implementing the CUDA device images properties. +struct CUDADeviceImageTy : public DeviceImageTy { + /// Create the CUDA image with the id and the target image pointer. + CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} + + /// Load the image as a CUDA module. + Error loadModule() { + assert(!Module && "Module already loaded"); + + CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr); + if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s")) + return Err; + + return Plugin::success(); + } + + /// Unload the CUDA module corresponding to the image. + Error unloadModule() { + assert(Module && "Module not loaded"); + + CUresult Res = cuModuleUnload(Module); + if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s")) + return Err; + + Module = nullptr; + + return Plugin::success(); + } + + /// Getter of the CUDA module. + CUmodule getModule() const { return Module; } + +private: + /// The CUDA module that loaded the image. + CUmodule Module; +}; + +/// Class implementing the CUDA device functionalities which derives from the +/// generic device class. +struct CUDADeviceTy : public GenericDeviceTy { + // Create a CUDA device with a device id and the default CUDA grid values. + CUDADeviceTy(int32_t DeviceId, int32_t NumDevices) + : GenericDeviceTy(DeviceId, NumDevices, NVPTXGridValues), + CUDAStreamManager(*this), CUDAEventManager(*this) {} + + ~CUDADeviceTy() {} + + /// Initialize the device, its resources and get its properties. + Error initImpl(GenericPluginTy &Plugin) override { + CUresult Res = cuDeviceGet(&Device, DeviceId); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s")) + return Err; + + // Query the current flags of the primary context and set its flags if + // it is inactive. + unsigned int FormerPrimaryCtxFlags = 0; + int FormerPrimaryCtxIsActive = 0; + Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags, + &FormerPrimaryCtxIsActive); + if (auto Err = + Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s")) + return Err; + + if (FormerPrimaryCtxIsActive) { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "The primary context is active, no change to its flags\n"); + if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) != + CU_CTX_SCHED_BLOCKING_SYNC) + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n"); + } else { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "The primary context is inactive, set its flags to " + "CU_CTX_SCHED_BLOCKING_SYNC\n"); + Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC); + if (auto Err = + Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s")) + return Err; + } + + // Retain the per device primary context and save it to use whenever this + // device is selected. + Res = cuDevicePrimaryCtxRetain(&Context, Device); + if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s")) + return Err; + + if (auto Err = setContext()) + return Err; + + // Initialize stream pool. + if (auto Err = CUDAStreamManager.init()) + return Err; + + // Initialize event pool. + if (auto Err = CUDAEventManager.init()) + return Err; + + // Query attributes to determine number of threads/block and blocks/grid. + if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + GridValues.GV_Max_Teams)) + return Err; + + if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + GridValues.GV_Max_WG_Size)) + return Err; + + if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, + GridValues.GV_Warp_Size)) + return Err; + + return Plugin::success(); + } + + /// Deinitialize the device and release its resources. + Error deinitImpl() override { + if (Context) { + if (auto Err = setContext()) + return Err; + } + + // Deinitialize the stream manager. + if (auto Err = CUDAStreamManager.deinit()) + return Err; + + if (auto Err = CUDAEventManager.deinit()) + return Err; + + // Close modules if necessary. + if (!LoadedImages.empty()) { + assert(Context && "Invalid CUDA context"); + + // Each image has its own module. + for (DeviceImageTy *Image : LoadedImages) { + CUDADeviceImageTy &CUDAImage = static_cast(*Image); + + // Unload the module of the image. + if (auto Err = CUDAImage.unloadModule()) + return Err; + } + } + + if (Context) { + CUresult Res = cuDevicePrimaryCtxRelease(Device); + if (auto Err = + Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s")) + return Err; + } + + // Invalidate context and device references. + Context = nullptr; + Device = CU_DEVICE_INVALID; + + return Plugin::success(); + } + + /// Allocate and construct a CUDA kernel. + Expected + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + CUDADeviceImageTy &CUDAImage = static_cast(Image); + + // Retrieve the function pointer of the kernel. + CUfunction Func; + CUresult Res = + cuModuleGetFunction(&Func, CUDAImage.getModule(), KernelEntry.name); + if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s", + KernelEntry.name)) + return std::move(Err); + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", DPxPTR(&KernelEntry), + KernelEntry.name, DPxPTR(Func)); + + // Create a metadata object for the exec mode global (auto-generated). + StaticGlobalTy ExecModeGlobal( + KernelEntry.name, "_exec_mode"); + + // Retrieve execution mode for the kernel. This may fail since some kernels + // may not have a execution mode. + GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); + if (auto Err = GHandler.readGlobalFromImage(*this, Image, ExecModeGlobal)) { + // In some cases the execution mode is not included, so use the default. + ExecModeGlobal.setValue(llvm::omp::OMP_TGT_EXEC_MODE_GENERIC); + // Consume the error since it is acceptable to fail. + [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); + + DP("Failed to read execution mode for '%s': %s\n" + "Using default GENERIC (1) execution mode\n", + KernelEntry.name, ErrStr.data()); + } + + // Check that the retrieved execution mode is valid. + if (!GenericKernelTy::isValidExecutionMode(ExecModeGlobal.getValue())) + return Plugin::error("Invalid execution mode %d for '%s'", + ExecModeGlobal.getValue(), KernelEntry.name); + + // Allocate and initialize the CUDA kernel. + CUDAKernelTy *CUDAKernel = Plugin::get().allocate(); + new (CUDAKernel) + CUDAKernelTy(KernelEntry.name, ExecModeGlobal.getValue(), Func); + + return CUDAKernel; + } + + /// Set the current context to this device's context. + Error setContext() override { + CUresult Res = cuCtxSetCurrent(Context); + return Plugin::check(Res, "Error in cuCtxSetCurrent: %s"); + } + + /// Get the stream of the asynchronous info sructure or get a new one. + CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) { + CUstream &Stream = AsyncInfoWrapper.getQueueAs(); + if (!Stream) + Stream = CUDAStreamManager.getStream(); + return Stream; + } + + /// Getters of CUDA references. + CUcontext getCUDAContext() const { return Context; } + CUdevice getCUDADevice() const { return Device; } + + /// Load the binary image into the device and allocate an image object. + Expected loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) override { + if (auto Err = setContext()) + return std::move(Err); + + // Allocate and initialize the image object. + CUDADeviceImageTy *CUDAImage = Plugin::get().allocate(); + new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage); + + // Load the CUDA module. + if (auto Err = CUDAImage->loadModule()) + return std::move(Err); + + return CUDAImage; + } + + /// Allocate memory on the device or related to the device. + void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + if (Size == 0) + return nullptr; + + if (auto Err = setContext()) { + REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + + void *MemAlloc = nullptr; + CUdeviceptr DevicePtr; + CUresult Res; + + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + Res = cuMemAlloc(&DevicePtr, Size); + MemAlloc = (void *)DevicePtr; + break; + case TARGET_ALLOC_HOST: + Res = cuMemAllocHost(&MemAlloc, Size); + break; + case TARGET_ALLOC_SHARED: + Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL); + MemAlloc = (void *)DevicePtr; + break; + } + + if (auto Err = + Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) { + REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); + return nullptr; + } + return MemAlloc; + } + + /// Deallocate memory on the device or related to the device. + int free(void *TgtPtr, TargetAllocTy Kind) override { + if (TgtPtr == nullptr) + return OFFLOAD_SUCCESS; + + if (auto Err = setContext()) { + REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + + CUresult Res; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_SHARED: + Res = cuMemFree((CUdeviceptr)TgtPtr); + break; + case TARGET_ALLOC_HOST: + Res = cuMemFreeHost(TgtPtr); + break; + } + + if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) { + REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); + return OFFLOAD_FAIL; + } + return OFFLOAD_SUCCESS; + } + + /// Synchronize current thread with the pending operations on the async info. + Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { + CUstream Stream = reinterpret_cast(AsyncInfo.Queue); + CUresult Res = cuStreamSynchronize(Stream); + + // Once the stream is synchronized, return it to stream pool and reset + // AsyncInfo. This is to make sure the synchronization only works for its + // own tasks. + CUDAStreamManager.returnStream(Stream); + AsyncInfo.Queue = nullptr; + + return Plugin::check(Res, "Error in cuStreamSynchronize: %s"); + } + + /// Submit data to the device (host to device transfer). + Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream = getStream(AsyncInfoWrapper); + if (!Stream) + return Plugin::error("Failure to get stream"); + + CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); + return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s"); + } + + /// Retrieve data from the device (device to host transfer). + Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (auto Err = setContext()) + return Err; + + CUstream Stream = getStream(AsyncInfoWrapper); + if (!Stream) + return Plugin::error("Failure to get stream"); + + CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); + return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s"); + } + + /// Exchange data between two devices directly. We may use peer access if + /// the CUDA devices and driver allow them. + Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override; + + /// Initialize the async info for interoperability purposes. + Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { + if (auto Err = setContext()) + return Err; + + if (!getStream(AsyncInfoWrapper)) + return Plugin::error("Failure to get stream"); + + return Plugin::success(); + } + + /// Initialize the device info for interoperability purposes. + Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { + assert(Context && "Context is null"); + assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device"); + + if (auto Err = setContext()) + return Err; + + if (!DeviceInfo->Context) + DeviceInfo->Context = Context; + + if (!DeviceInfo->Device) + DeviceInfo->Device = reinterpret_cast(Device); + + return Plugin::success(); + } + + /// Create an event. + Error createEventImpl(void **EventPtrStorage) override { + CUevent *Event = reinterpret_cast(EventPtrStorage); + *Event = CUDAEventManager.getEvent(); + return Plugin::success(); + } + + /// Destroy a previously created event. + Error destroyEventImpl(void *EventPtr) override { + CUevent Event = reinterpret_cast(EventPtr); + CUDAEventManager.returnEvent(Event); + return Plugin::success(); + } + + /// Record the event. + Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + CUevent Event = reinterpret_cast(EventPtr); + + CUstream Stream = getStream(AsyncInfoWrapper); + if (!Stream) + return Plugin::error("Failure to get stream"); + + CUresult Res = cuEventRecord(Event, Stream); + return Plugin::check(Res, "Error in cuEventRecord: %s"); + } + + /// Make the stream wait on the event. + Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + CUevent Event = reinterpret_cast(EventPtr); + + CUstream Stream = getStream(AsyncInfoWrapper); + if (!Stream) + return Plugin::error("Failure to get stream"); + + // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from + // specific CUDA version, and defined as 0x0. In previous version, per CUDA + // API document, that argument has to be 0x0. + CUresult Res = cuStreamWaitEvent(Stream, Event, 0); + return Plugin::check(Res, "Error in cuStreamWaitEvent: %s"); + } + + /// Synchronize the current thread with the event. + Error syncEventImpl(void *EventPtr) override { + CUevent Event = reinterpret_cast(EventPtr); + CUresult Res = cuEventSynchronize(Event); + return Plugin::check(Res, "Error in cuEventSynchronize: %s"); + } + + /// Print information about the device. + Error printInfoImpl() override { + char TmpChar[1000]; + std::string TmpStr; + size_t TmpSt; + int TmpInt, TmpInt2, TmpInt3; + + // TODO: All these calls should be checked, but the whole printInfo must be + // improved, so we will refactor it in the future. + cuDriverGetVersion(&TmpInt); + printf(" CUDA Driver Version: \t\t%d \n", TmpInt); + printf(" CUDA Device Number: \t\t%d \n", DeviceId); + + cuDeviceGetName(TmpChar, 1000, Device); + printf(" Device Name: \t\t\t%s \n", TmpChar); + + cuDeviceTotalMem(&TmpSt, Device); + printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt); + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + Device); + printf(" Number of Multiprocessors: \t\t%d \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device); + printf(" Concurrent Copy and Execution: \t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + Device); + printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt); + + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device); + printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + Device), + printf(" Registers per Block: \t\t%d \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); + printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + Device); + printf(" Maximum Threads per Block: \t\t%d \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); + cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device); + cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device); + printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2, + TmpInt3); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device); + cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device); + cuDeviceGetAttribute(&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device); + printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2, + TmpInt3); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device); + printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + Device); + printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device); + printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, + Device); + printf(" Execution Timeout: \t\t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device); + printf(" Integrated Device: \t\t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, + Device); + printf(" Can Map Host Memory: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device); + if (TmpInt == CU_COMPUTEMODE_DEFAULT) + TmpStr = "DEFAULT"; + else if (TmpInt == CU_COMPUTEMODE_PROHIBITED) + TmpStr = "PROHIBITED"; + else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS) + TmpStr = "EXCLUSIVE PROCESS"; + else + TmpStr = "unknown"; + printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str()); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, + Device); + printf(" Concurrent Kernels: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device); + printf(" ECC Enabled: \t\t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + Device); + printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + Device); + printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, Device); + printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt); + + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, Device); + printf(" Max Threads Per SMP: \t\t%d \n", TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, + Device); + printf(" Async Engines: \t\t\t%s (%d) \n", TmpInt ? "Yes" : "No", + TmpInt); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, + Device); + printf(" Unified Addressing: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device); + printf(" Managed Memory: \t\t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + Device); + printf(" Concurrent Managed Memory: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute( + &TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device); + printf(" Preemption Supported: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, + Device); + printf(" Cooperative Launch: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device); + printf(" Multi-Device Boars: \t\t%s \n", TmpInt ? "Yes" : "No"); + + cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + Device); + cuDeviceGetAttribute(&TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + Device); + printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2); + + return Plugin::success(); + } + + /// Getters and setters for stack and heap sizes. + Error getDeviceStackSize(uint64_t &Value) override { + return getCtxLimit(CU_LIMIT_STACK_SIZE, Value); + } + Error setDeviceStackSize(uint64_t Value) override { + return setCtxLimit(CU_LIMIT_STACK_SIZE, Value); + } + Error getDeviceHeapSize(uint64_t &Value) override { + return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); + } + Error setDeviceHeapSize(uint64_t Value) override { + return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); + } + + /// CUDA-specific functions for getting and setting context limits. + Error setCtxLimit(CUlimit Kind, uint64_t Value) { + CUresult Res = cuCtxSetLimit(Kind, Value); + return Plugin::check(Res, "Error in cuCtxSetLimit: %s"); + } + Error getCtxLimit(CUlimit Kind, uint64_t &Value) { + CUresult Res = cuCtxGetLimit(&Value, Kind); + return Plugin::check(Res, "Error in cuCtxGetLimit: %s"); + } + + /// CUDA-specific function to get device attributes. + Error getDeviceAttr(uint32_t Kind, uint32_t &Value) { + // TODO: Warn if the new value is larger than the old. + CUresult Res = + cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device); + return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"); + } + +private: + using CUDAStreamManagerTy = GenericStreamManagerTy; + using CUDAEventManagerTy = GenericEventManagerTy; + + /// Stream manager for CUDA streams. + CUDAStreamManagerTy CUDAStreamManager; + + /// Event manager for CUDA events. + CUDAEventManagerTy CUDAEventManager; + + /// The device's context. This context should be set before performing + /// operations on the device. + CUcontext Context = nullptr; + + /// The CUDA device handler. + CUdevice Device = CU_DEVICE_INVALID; +}; + +Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, + uint32_t NumThreads, uint64_t NumBlocks, + uint32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const { + CUDADeviceTy &CUDADevice = static_cast(GenericDevice); + + CUstream Stream = CUDADevice.getStream(AsyncInfoWrapper); + if (!Stream) + return Plugin::error("Failure to get stream"); + + CUresult Res = + cuLaunchKernel(Func, NumBlocks, /* gridDimY */ 1, + /* gridDimZ */ 1, NumThreads, + /* blockDimY */ 1, /* blockDimZ */ 1, DynamicMemorySize, + Stream, (void **)KernelArgs, nullptr); + return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName()); +} + +/// Class implementing the CUDA-specific functionalities of the global handler. +class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy { +public: + /// Get the metadata of a global from the device. The name and size of the + /// global is read from DeviceGlobal and the address of the global is written + /// to DeviceGlobal. + Error getGlobalMetadataFromDevice(GenericDeviceTy &Device, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override { + CUDADeviceImageTy &CUDAImage = static_cast(Image); + + const char *GlobalName = DeviceGlobal.getName().data(); + + size_t CUSize; + CUdeviceptr CUPtr; + CUresult Res = + cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName); + if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s", + GlobalName)) + return Err; + + if (CUSize != DeviceGlobal.getSize()) + return Plugin::error( + "Failed to load global '%s' due to size mismatch (%zu != %zu)", + GlobalName, CUSize, (size_t)DeviceGlobal.getSize()); + + DeviceGlobal.setPtr(reinterpret_cast(CUPtr)); + return Plugin::success(); + } +}; + +/// Class implementing the CUDA-specific functionalities of the plugin. +struct CUDAPluginTy final : public GenericPluginTy { + /// Create a CUDA plugin and initialize the CUDA driver. + CUDAPluginTy() : GenericPluginTy() { + CUresult Res = cuInit(0); + if (Res == CUDA_ERROR_INVALID_HANDLE) { + // Cannot call cuGetErrorString if dlsym failed. + DP("Failed to load CUDA shared library\n"); + return; + } + + if (Res == CUDA_ERROR_NO_DEVICE) { + // Do not initialize if there are no devices. + DP("There are no devices supporting CUDA.\n"); + return; + } + + if (auto Err = Plugin::check(Res, "Error in cuInit: %s")) { + REPORT("%s\n", toString(std::move(Err)).data()); + return; + } + + // Get the number of devices. + int NumDevices; + Res = cuDeviceGetCount(&NumDevices); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s")) { + REPORT("%s\n", toString(std::move(Err)).data()); + return; + } + + // Do not initialize if there are no devices. + if (NumDevices == 0) { + DP("There are no devices supporting CUDA.\n"); + return; + } + + // Initialize the generic plugin structure. + GenericPluginTy::init(NumDevices, new CUDAGlobalHandlerTy()); + } + + /// This class should not be copied. + CUDAPluginTy(const CUDAPluginTy &) = delete; + CUDAPluginTy(CUDAPluginTy &&) = delete; + + ~CUDAPluginTy() {} + + /// Get the ELF code for recognizing the compatible image binary. + uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; } + + /// Create a CUDA device with a specific id. + CUDADeviceTy &createDevice(int32_t DeviceId) override { + CUDADeviceTy *Device = new CUDADeviceTy(DeviceId, getNumDevices()); + return *Device; + } + + /// Check whether the image is compatible with the available CUDA devices. + Expected isImageCompatible(__tgt_image_info *Info) const override { + for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) { + CUdevice Device; + CUresult Res = cuDeviceGet(&Device, DevId); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s")) + return std::move(Err); + + int32_t Major, Minor; + Res = cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) + return std::move(Err); + + Res = cuDeviceGetAttribute( + &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) + return std::move(Err); + + StringRef ArchStr(Info->Arch); + StringRef PrefixStr("sm_"); + if (!ArchStr.startswith(PrefixStr)) + return Plugin::error("Unrecognized image arch %s", ArchStr.data()); + + int32_t ImageMajor = ArchStr[PrefixStr.size() + 0] - '0'; + int32_t ImageMinor = ArchStr[PrefixStr.size() + 1] - '0'; + + // A cubin generated for a certain compute capability is supported to run + // on any GPU with the same major revision and same or higher minor + // revision. + if (Major != ImageMajor || Minor < ImageMinor) + return false; + } + return true; + } +}; + +Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr, + GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) { + if (auto Err = setContext()) + return Err; + + CUDADeviceTy &DstDevice = static_cast(DstGenericDevice); + + CUresult Res; + int32_t DstDeviceId = DstDevice.DeviceId; + CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr; + CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr; + + int CanAccessPeer = 0; + if (DeviceId != DstDeviceId) { + // Make sure the lock is released before performing the copies. + std::lock_guard Lock(PeerAccessesLock); + + switch (PeerAccesses[DstDeviceId]) { + case PeerAccessState::AVAILABLE: + CanAccessPeer = 1; + break; + case PeerAccessState::UNAVAILABLE: + CanAccessPeer = 0; + break; + case PeerAccessState::PENDING: + // Check whether the source device can access the destination device. + Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device); + if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s")) + return Err; + + if (CanAccessPeer) { + Res = cuCtxEnablePeerAccess(DstDevice.Context, 0); + if (Res == CUDA_ERROR_TOO_MANY_PEERS) { + // Resources may be exhausted due to many P2P links. + CanAccessPeer = 0; + DP("Too many P2P so fall back to D2D memcpy"); + } else if (auto Err = + Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s")) + return Err; + } + PeerAccesses[DstDeviceId] = (CanAccessPeer) + ? PeerAccessState::AVAILABLE + : PeerAccessState::UNAVAILABLE; + } + } + + CUstream Stream = getStream(AsyncInfoWrapper); + if (!Stream) + return Plugin::error("Failure to get stream"); + + if (CanAccessPeer) { + // TODO: Should we fallback to D2D if peer access fails? + Res = cuMemcpyPeerAsync(CUDstPtr, Context, CUSrcPtr, DstDevice.Context, + Size, Stream); + return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s"); + } + + // Fallback to D2D copy. + Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream); + return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s"); +} + +Error Plugin::init() { + // Call the getter to intialize the CUDA plugin. + get(); + return Plugin::success(); +} + +Error Plugin::deinit() { + // The CUDA plugin and the CUDA driver should already be deinitialized + // at this point. So do nothing for this plugin. + if (Plugin::isActive()) + return Plugin::error("CUDA plugin is not deinitialized"); + + return Plugin::success(); +} + +GenericPluginTy &Plugin::get() { + // The CUDA plugin instance is built the first time that Plugin::get() is + // called thanks to the following static variable. The ideal implementation + // would initialize the plugin in Plugin::init() (__tgt_rtl_plugin_init) and + // destroy it in Plugin::deinit() (__tgt_rtl_plugin_deinit). However, at the + // time Plugin::deinit() is called, the CUDA driver is already shut down. That + // is caused by the fact that __tgt_rtl_plugin_deinit is called from a dtor + // in libomptarget. Thus, this is a workaround until that aspect is fixed. + static CUDAPluginTy CUDAPlugin; + assert(Plugin::isActive() && "Plugin is not active"); + return CUDAPlugin; +} + +template +Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { + CUresult ResultCode = static_cast(Code); + if (ResultCode == CUDA_SUCCESS) + return Error::success(); + + const char *Desc = "Unknown error"; + CUresult Ret = cuGetErrorString(ResultCode, &Desc); + if (Ret != CUDA_SUCCESS) + REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code); + + return createStringError(inconvertibleErrorCode(), + ErrFmt, Args..., Desc); +} + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm diff --git a/openmp/libomptarget/plugins-nextgen/exports b/openmp/libomptarget/plugins-nextgen/exports new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/exports @@ -0,0 +1,6 @@ +VERS1.0 { + global: + __tgt_rtl*; + local: + *; +}; diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -0,0 +1,401 @@ +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL NextGen for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include "Debug.h" +#include "DeviceEnvironment.h" +#include "GlobalHandler.h" +#include "PluginInterface.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" +#include "llvm/Support/DynamicLibrary.h" + +// The number of devices in this plugin. +#define NUM_DEVICES 4 + +// The ELF ID should be defined at compile-time by the build system. +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +namespace llvm { +namespace omp { +namespace target { +namespace plugin { + +/// Forward declarations for all specialized data structures. +struct GenELF64KernelTy; +struct GenELF64DeviceTy; +struct GenELF64PluginTy; + +using llvm::sys::DynamicLibrary; + +/// Class implementing kernel functionalities for GenELF64. +struct GenELF64KernelTy : public GenericKernelTy { + /// Construct the kernel with a name, execution mode and a function. + GenELF64KernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode, + void (*Func)(void)) + : GenericKernelTy(Name, ExecutionMode), Func(Func) {} + + /// Initialize the kernel. + Error initImpl(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image) override { + // Set the maximum number of threads to a single. + MaxNumThreads = 1; + return Plugin::success(); + } + + /// Launch the kernel using the libffi. + Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, + uint64_t NumBlocks, uint32_t DynamicMemorySize, + int32_t NumKernelArgs, void *KernelArgs, + AsyncInfoWrapperTy &AsyncInfoWrapper) const override { + // Create a vector of ffi_types, one per argument. + SmallVector ArgTypes(NumKernelArgs, &ffi_type_pointer); + ffi_type **ArgTypesPtr = (ArgTypes.size()) ? &ArgTypes[0] : nullptr; + + // Prepare the cif structure before running the kernel function. + ffi_cif Cif; + ffi_status Status = ffi_prep_cif(&Cif, FFI_DEFAULT_ABI, NumKernelArgs, + &ffi_type_void, ArgTypesPtr); + if (Status != FFI_OK) + return Plugin::error("Error in ffi_prep_cif: %d", Status); + + // Call the kernel function through libffi. + long Return; + ffi_call(&Cif, Func, &Return, (void **)KernelArgs); + + return Plugin::success(); + } + + /// Get the default number of blocks and threads for the kernel. + uint64_t getDefaultNumBlocks(GenericDeviceTy &) const override { return 1; } + uint32_t getDefaultNumThreads(GenericDeviceTy &) const override { return 1; } + +private: + /// The kernel function to execute. + void (*Func)(void); +}; + +/// Class implementing the GenELF64 device images properties. +struct GenELF64DeviceImageTy : public DeviceImageTy { + /// Create the GenELF64 image with the id and the target image pointer. + GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, TgtImage), DynLib() {} + + /// Getter and setter for the dynamic library. + DynamicLibrary &getDynamicLibrary() { return DynLib; } + void setDynamicLibrary(const DynamicLibrary &Lib) { DynLib = Lib; } + +private: + /// The dynamic library that loaded the image. + DynamicLibrary DynLib; +}; + +/// Class implementing the device functionalities for GenELF64. +struct GenELF64DeviceTy : public GenericDeviceTy { + /// Create the device with a specific id. + GenELF64DeviceTy(int32_t DeviceId, int32_t NumDevices) + : GenericDeviceTy(DeviceId, NumDevices, GenELF64GridValues) {} + + ~GenELF64DeviceTy() {} + + /// Initialize the device, which is a no-op + Error initImpl(GenericPluginTy &Plugin) override { return Plugin::success(); } + + /// Deinitialize the device, which is a no-op + Error deinitImpl() override { return Plugin::success(); } + + /// Construct the kernel for a specific image on the device. + Expected + constructKernelEntry(const __tgt_offload_entry &KernelEntry, + DeviceImageTy &Image) override { + GlobalTy Func(KernelEntry); + + // Get the metadata (address) of the kernel function. + GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler(); + if (auto Err = GHandler.getGlobalMetadataFromDevice(*this, Image, Func)) + return std::move(Err); + + // Allocate and create the kernel. + GenELF64KernelTy *GenELF64Kernel = + Plugin::get().allocate(); + new (GenELF64Kernel) GenELF64KernelTy( + KernelEntry.name, OMP_TGT_EXEC_MODE_GENERIC, (void (*)())Func.getPtr()); + + return GenELF64Kernel; + } + + /// Set the current context to this device, which is a no-op. + Error setContext() override { return Plugin::success(); } + + /// Load the binary image into the device and allocate an image object. + Expected loadBinaryImpl(const __tgt_device_image *TgtImage, + int32_t ImageId) override { + // Allocate and initialize the image object. + GenELF64DeviceImageTy *Image = + Plugin::get().allocate(); + new (Image) GenELF64DeviceImageTy(ImageId, TgtImage); + + // Create a temporary file. + char TmpFileName[] = "/tmp/tmpfile_XXXXXX"; + int TmpFileFd = mkstemp(TmpFileName); + if (TmpFileFd == -1) + return Plugin::error("Failed to create tmpfile for loading target image"); + + // Open the temporary file. + FILE *TmpFile = fdopen(TmpFileFd, "wb"); + if (!TmpFile) + return Plugin::error("Failed to open tmpfile %s for loading target image", + TmpFileName); + + // Write the image into the temporary file. + size_t Written = fwrite(Image->getStart(), Image->getSize(), 1, TmpFile); + if (Written != 1) + return Plugin::error("Failed to write target image to tmpfile %s", + TmpFileName); + + // Close the temporary file. + int Ret = fclose(TmpFile); + if (Ret) + return Plugin::error("Failed to close tmpfile %s with the target image", + TmpFileName); + + // Load the temporary file as a dynamic library. + std::string ErrMsg; + DynamicLibrary DynLib = + DynamicLibrary::getPermanentLibrary(TmpFileName, &ErrMsg); + + // Check if the loaded library is valid. + if (!DynLib.isValid()) + return Plugin::error("Failed to load target image: %s", ErrMsg.c_str()); + + // Save a reference of the image's dynamic library. + Image->setDynamicLibrary(DynLib); + + return Image; + } + + /// Allocate memory. Use std::malloc in all cases. + void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + if (Size == 0) + return nullptr; + + void *MemAlloc = nullptr; + switch (Kind) { + case TARGET_ALLOC_DEFAULT: + case TARGET_ALLOC_DEVICE: + case TARGET_ALLOC_HOST: + case TARGET_ALLOC_SHARED: + MemAlloc = std::malloc(Size); + break; + } + return MemAlloc; + } + + /// Free the memory. Use std::free in all cases. + int free(void *TgtPtr, TargetAllocTy Kind) override { + std::free(TgtPtr); + return OFFLOAD_SUCCESS; + } + + /// Submit data to the device (host to device transfer). + Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + std::memcpy(TgtPtr, HstPtr, Size); + return Plugin::success(); + } + + /// Retrieve data from the device (device to host transfer). + Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + std::memcpy(HstPtr, TgtPtr, Size); + return Plugin::success(); + } + + /// Exchange data between two devices within the plugin. This function is not + /// supported in this plugin. + Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice, + void *DstPtr, int64_t Size, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + // This function should never be called because the function + // GenELF64PluginTy::isDataExchangable() returns false. + return Plugin::error("dataExchangeImpl not supported"); + } + + /// All functions are already synchronous. No need to do anything on this + /// synchronization function. + Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { + return Plugin::success(); + } + + /// This plugin does not support interoperability + Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::error("initAsyncInfoImpl not supported"); + } + + /// This plugin does not support interoperability + Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override { + return Plugin::error("initDeviceInfoImpl not supported"); + } + + /// This plugin does not support the event API. Do nothing without failing. + Error createEventImpl(void **EventPtrStorage) override { + *EventPtrStorage = nullptr; + return Plugin::success(); + } + Error destroyEventImpl(void *EventPtr) override { return Plugin::success(); } + Error recordEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::success(); + } + Error waitEventImpl(void *EventPtr, + AsyncInfoWrapperTy &AsyncInfoWrapper) override { + return Plugin::success(); + } + Error syncEventImpl(void *EventPtr) override { return Plugin::success(); } + + /// Print information about the device. + Error printInfoImpl() override { + printf(" This is a generic-elf-64bit device\n"); + return Plugin::success(); + } + + /// This plugin should not setup the device environment. + virtual bool shouldSetupDeviceEnvironment() const override { return false; }; + + /// Getters and setters for stack size and heap size not relevant. + Error getDeviceStackSize(uint64_t &Value) override { + Value = 0; + return Plugin::success(); + } + Error setDeviceStackSize(uint64_t Value) override { + return Plugin::success(); + } + Error getDeviceHeapSize(uint64_t &Value) override { + Value = 0; + return Plugin::success(); + } + Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } + +private: + /// Grid values for Generic ELF64 plugins. + static constexpr GV GenELF64GridValues = { + 1, // GV_Slot_Size + 1, // GV_Warp_Size + 1, // GV_Max_Teams + 1, // GV_SimpleBufferSize + 1, // GV_Max_WG_Size + 1, // GV_Default_WG_Size + }; +}; + +class GenELF64GlobalHandlerTy final : public GenericGlobalHandlerTy { +public: + Error getGlobalMetadataFromDevice(GenericDeviceTy &GenericDevice, + DeviceImageTy &Image, + GlobalTy &DeviceGlobal) override { + const char *GlobalName = DeviceGlobal.getName().data(); + GenELF64DeviceImageTy &GenELF64Image = + static_cast(Image); + + // Get dynamic library that has loaded the device image. + DynamicLibrary &DynLib = GenELF64Image.getDynamicLibrary(); + + // Get the address of the symbol. + void *Addr = DynLib.getAddressOfSymbol(GlobalName); + if (Addr == nullptr) { + return Plugin::error("Failed to load global '%s'", GlobalName); + } + + // Save the pointer to the symbol. + DeviceGlobal.setPtr(Addr); + + return Plugin::success(); + } +}; + +/// Class implementing the plugin functionalities for GenELF64. +struct GenELF64PluginTy final : public GenericPluginTy { + /// Create the plugin. + GenELF64PluginTy() : GenericPluginTy() { + // Initialize the generic plugin structure with multiple devices and a + // global handler. + GenericPluginTy::init(NUM_DEVICES, new GenELF64GlobalHandlerTy()); + } + + /// This class should not be copied. + GenELF64PluginTy(const GenELF64PluginTy &) = delete; + GenELF64PluginTy(GenELF64PluginTy &&) = delete; + + ~GenELF64PluginTy() {} + + /// Get the ELF code to recognize the compatible binary images. + uint16_t getMagicElfBits() const override { return TARGET_ELF_ID; } + + /// Create a GenELF64 device with a specific id. + GenELF64DeviceTy &createDevice(int32_t DeviceId) override { + GenELF64DeviceTy *Device = new GenELF64DeviceTy(DeviceId, getNumDevices()); + return *Device; + } + + /// This plugin does not support exchanging data between two devices. + bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { + return false; + } + + /// All images (ELF-compatible) should be compatible with this plugin. + Expected isImageCompatible(__tgt_image_info *Info) const override { + return true; + } +}; + +Error Plugin::init() { + // Call the getter to intialize the GenELF64 plugin. + get(); + return Plugin::success(); +} + +Error Plugin::deinit() { + // The Generic ELF64 plugin should already be deinitialized at this point. + if (Plugin::isActive()) + return Plugin::error("Generic ELF64 plugin is not deinitialized"); + + return Plugin::success(); +} + +GenericPluginTy &Plugin::get() { + static GenELF64PluginTy GenELF64Plugin; + assert(Plugin::isActive() && "Plugin is not active"); + return GenELF64Plugin; +} + +template +Error Plugin::check(int32_t Code, const char *ErrMsg, ArgsTy... Args) { + if (Code == 0) + return Error::success(); + + return createStringError( + inconvertibleErrorCode(), ErrMsg, Args..., std::to_string(Code).data()); +} + +} // namespace plugin +} // namespace target +} // namespace omp +} // namespace llvm diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.") +endif() diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64le machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.") +endif() diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt @@ -0,0 +1,17 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a x86_64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64_nextgen("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") +else() + libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.") +endif() diff --git a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h --- a/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h +++ b/openmp/libomptarget/plugins/cuda/dynamic_cuda/cuda.h @@ -24,11 +24,14 @@ typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +#define CU_DEVICE_INVALID ((CUdevice)-2) + typedef enum cudaError_enum { CUDA_SUCCESS = 0, CUDA_ERROR_INVALID_VALUE = 1, CUDA_ERROR_NO_DEVICE = 100, CUDA_ERROR_INVALID_HANDLE = 400, + CUDA_ERROR_TOO_MANY_PEERS = 711, } CUresult; typedef enum CUstream_flags_enum { diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -16,6 +16,8 @@ #include "private.h" #include "rtl.h" +#include "Utilities.h" + #include #include #include @@ -24,16 +26,17 @@ using namespace llvm; using namespace llvm::sys; +using namespace llvm::omp::target; // List of all plugins that can support offloading. static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so", - /* SX-Aurora VE target */ "libomptarget.rtl.ve.so", - /* AMDGPU target */ "libomptarget.rtl.amdgpu.so", - /* Remote target */ "libomptarget.rtl.rpc.so", + /* PowerPC target */ "libomptarget.rtl.ppc64", + /* x86_64 target */ "libomptarget.rtl.x86_64", + /* CUDA target */ "libomptarget.rtl.cuda", + /* AArch64 target */ "libomptarget.rtl.aarch64", + /* SX-Aurora VE target */ "libomptarget.rtl.ve", + /* AMDGPU target */ "libomptarget.rtl.amdgpu", + /* Remote target */ "libomptarget.rtl.rpc", }; PluginManager *PM; @@ -86,152 +89,166 @@ DP("Loading RTLs...\n"); + BoolEnvar NextGenPlugins("LIBOMPTARGET_NEXTGEN_PLUGINS", false); + // Attempt to open all the plugins and, if they exist, check if the interface // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - std::string ErrMsg; - auto DynLibrary = std::make_unique( - sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); - - if (!DynLibrary->isValid()) { - // Library does not exist or cannot be found. - DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str()); - continue; - } - - DP("Successfully loaded library '%s'!\n", Name); - + for (const char *Name : RTLNames) { AllRTLs.emplace_back(); - // Retrieve the RTL information from the runtime library. - RTLInfoTy &R = AllRTLs.back(); - - // Remove plugin on failure to call optional init_plugin - *((void **)&R.init_plugin) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin"); - if (R.init_plugin) { - int32_t Rc = R.init_plugin(); - if (Rc != OFFLOAD_SUCCESS) { - DP("Unable to initialize library '%s': %u!\n", Name, Rc); - AllRTLs.pop_back(); + RTLInfoTy &RTL = AllRTLs.back(); + + const std::string BaseRTLName(Name); + if (NextGenPlugins) { + if (attemptLoadRTL(BaseRTLName + ".nextgen.so", RTL)) continue; - } - } - bool ValidPlugin = true; - - if (!(*((void **)&R.is_valid_binary) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary"))) - ValidPlugin = false; - if (!(*((void **)&R.number_of_devices) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) - ValidPlugin = false; - if (!(*((void **)&R.init_device) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) - ValidPlugin = false; - if (!(*((void **)&R.load_binary) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary"))) - ValidPlugin = false; - if (!(*((void **)&R.data_alloc) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc"))) - ValidPlugin = false; - if (!(*((void **)&R.data_submit) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit"))) - ValidPlugin = false; - if (!(*((void **)&R.data_retrieve) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve"))) - ValidPlugin = false; - if (!(*((void **)&R.data_delete) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete"))) - ValidPlugin = false; - if (!(*((void **)&R.run_region) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region"))) - ValidPlugin = false; - if (!(*((void **)&R.run_team_region) = DynLibrary->getAddressOfSymbol( - "__tgt_rtl_run_target_team_region"))) - ValidPlugin = false; - - // Invalid plugin - if (!ValidPlugin) { - DP("Invalid plugin as necessary interface is not found.\n"); - AllRTLs.pop_back(); - continue; + DP("Falling back to original plugin...\n"); } - // No devices are supported by this RTL? - if (!(R.NumberOfDevices = R.number_of_devices())) { - // The RTL is invalid! Will pop the object from the RTLs list. - DP("No devices supported in this RTL\n"); + if (!attemptLoadRTL(BaseRTLName + ".so", RTL)) AllRTLs.pop_back(); - continue; + } + + DP("RTLs loaded!\n"); +} + +bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) { + const char *Name = RTLName.c_str(); + + DP("Loading library '%s'...\n", Name); + + std::string ErrMsg; + auto DynLibrary = std::make_unique( + sys::DynamicLibrary::getPermanentLibrary(Name, &ErrMsg)); + + if (!DynLibrary->isValid()) { + // Library does not exist or cannot be found. + DP("Unable to load library '%s': %s!\n", Name, ErrMsg.c_str()); + return false; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Remove plugin on failure to call optional init_plugin + *((void **)&RTL.init_plugin) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_plugin"); + if (RTL.init_plugin) { + int32_t Rc = RTL.init_plugin(); + if (Rc != OFFLOAD_SUCCESS) { + DP("Unable to initialize library '%s': %u!\n", Name, Rc); + return false; } + } -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif + bool ValidPlugin = true; + + if (!(*((void **)&RTL.is_valid_binary) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary"))) + ValidPlugin = false; + if (!(*((void **)&RTL.number_of_devices) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_number_of_devices"))) + ValidPlugin = false; + if (!(*((void **)&RTL.init_device) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device"))) + ValidPlugin = false; + if (!(*((void **)&RTL.load_binary) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_load_binary"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_alloc) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_alloc"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_submit) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_retrieve) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve"))) + ValidPlugin = false; + if (!(*((void **)&RTL.data_delete) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_delete"))) + ValidPlugin = false; + if (!(*((void **)&RTL.run_region) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region"))) + ValidPlugin = false; + if (!(*((void **)&RTL.run_team_region) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_team_region"))) + ValidPlugin = false; + + // Invalid plugin + if (!ValidPlugin) { + DP("Invalid plugin as necessary interface is not found.\n"); + return false; + } - DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), - R.NumberOfDevices); - - // Optional functions - *((void **)&R.deinit_plugin) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin"); - *((void **)&R.is_valid_binary_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info"); - *((void **)&R.deinit_device) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device"); - *((void **)&R.init_requires) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires"); - *((void **)&R.data_submit_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async"); - *((void **)&R.data_retrieve_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async"); - *((void **)&R.run_region_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async"); - *((void **)&R.run_team_region_async) = DynLibrary->getAddressOfSymbol( - "__tgt_rtl_run_target_team_region_async"); - *((void **)&R.synchronize) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize"); - *((void **)&R.data_exchange) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange"); - *((void **)&R.data_exchange_async) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async"); - *((void **)&R.is_data_exchangable) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable"); - *((void **)&R.register_lib) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib"); - *((void **)&R.unregister_lib) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib"); - *((void **)&R.supports_empty_images) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images"); - *((void **)&R.set_info_flag) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag"); - *((void **)&R.print_device_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info"); - *((void **)&R.create_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event"); - *((void **)&R.record_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event"); - *((void **)&R.wait_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event"); - *((void **)&R.sync_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event"); - *((void **)&R.destroy_event) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event"); - *((void **)&R.release_async_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info"); - *((void **)&R.init_async_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); - *((void **)&R.init_device_info) = - DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); - - R.LibraryHandler = std::move(DynLibrary); + // No devices are supported by this RTL? + if (!(RTL.NumberOfDevices = RTL.number_of_devices())) { + // The RTL is invalid! Will pop the object from the RTLs list. + DP("No devices supported in this RTL\n"); + return false; } - DP("RTLs loaded!\n"); +#ifdef LIBOMPTARGET_DEBUG + RTL.RTLName = Name; +#endif - return; + DP("Registering RTL %s supporting %d devices!\n", Name, RTL.NumberOfDevices); + + // Optional functions + *((void **)&RTL.deinit_plugin) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_plugin"); + *((void **)&RTL.is_valid_binary_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_valid_binary_info"); + *((void **)&RTL.deinit_device) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_deinit_device"); + *((void **)&RTL.init_requires) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_requires"); + *((void **)&RTL.data_submit_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_submit_async"); + *((void **)&RTL.data_retrieve_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_retrieve_async"); + *((void **)&RTL.run_region_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_region_async"); + *((void **)&RTL.run_team_region_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_run_target_team_region_async"); + *((void **)&RTL.synchronize) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_synchronize"); + *((void **)&RTL.data_exchange) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange"); + *((void **)&RTL.data_exchange_async) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async"); + *((void **)&RTL.is_data_exchangable) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable"); + *((void **)&RTL.register_lib) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_register_lib"); + *((void **)&RTL.unregister_lib) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_unregister_lib"); + *((void **)&RTL.supports_empty_images) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_supports_empty_images"); + *((void **)&RTL.set_info_flag) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_set_info_flag"); + *((void **)&RTL.print_device_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_print_device_info"); + *((void **)&RTL.create_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_create_event"); + *((void **)&RTL.record_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_record_event"); + *((void **)&RTL.wait_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_wait_event"); + *((void **)&RTL.sync_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_sync_event"); + *((void **)&RTL.destroy_event) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_destroy_event"); + *((void **)&RTL.release_async_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_release_async_info"); + *((void **)&RTL.init_async_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_async_info"); + *((void **)&RTL.init_device_info) = + DynLibrary->getAddressOfSymbol("__tgt_rtl_init_device_info"); + + RTL.LibraryHandler = std::move(DynLibrary); + + // Successfully loaded + return true; } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -21,6 +21,10 @@ if 'LIBOMPTARGET_DEBUG' in os.environ: config.environment['LIBOMPTARGET_DEBUG'] = os.environ['LIBOMPTARGET_DEBUG'] +# Allow running the tests with nextgen plugins when available +if 'LIBOMPTARGET_NEXTGEN_PLUGINS' in os.environ: + config.environment['LIBOMPTARGET_NEXTGEN_PLUGINS'] = os.environ['LIBOMPTARGET_NEXTGEN_PLUGINS'] + if 'OMP_TARGET_OFFLOAD' in os.environ: config.environment['OMP_TARGET_OFFLOAD'] = os.environ['OMP_TARGET_OFFLOAD']