diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -610,6 +610,8 @@ CmdArgs.push_back("gnu"); CmdArgs.push_back("--no-undefined"); CmdArgs.push_back("-shared"); + std::string ArchArg = std::string("-plugin-opt=mcpu=").append(Arch.str()); + CmdArgs.push_back(ArchArg); CmdArgs.push_back("-o"); CmdArgs.push_back(TempFile); @@ -1147,6 +1149,9 @@ Error wrapOpenMPImages(Module &M, ArrayRef Images) { SmallVector, 4> SavedBuffers; SmallVector, 4> ImagesToWrap; + SmallVector, 4> OffloadArchs; + std::string Arch; + OffloadArchs.reserve(Images.size()); for (const DeviceFile &File : Images) { llvm::ErrorOr> ImageOrError = llvm::MemoryBuffer::getFileOrSTDIN(File.Filename); @@ -1155,9 +1160,19 @@ ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(), (*ImageOrError)->getBufferSize()); SavedBuffers.emplace_back(std::move(*ImageOrError)); + + if (Arch.empty()) { + Arch = std::string(File.Arch); + Arch.push_back('\0'); + OffloadArchs.emplace_back(Arch.data(), Arch.size()); + } else { + auto curSize = Arch.size(); + Arch.append(File.Arch); + OffloadArchs.emplace_back(&(Arch.at(curSize)), File.Arch.size()); + } } - if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap)) + if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap, OffloadArchs)) return Err; return Error::success(); } diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h @@ -14,8 +14,9 @@ /// Wraps the input device images into the module \p M as global symbols and /// registers the images with the OpenMP Offloading runtime libomptarget. 
-llvm::Error wrapOpenMPBinaries(llvm::Module &M, - llvm::ArrayRef> Images); +llvm::Error +wrapOpenMPBinaries(llvm::Module &M, llvm::ArrayRef> Images, + llvm::ArrayRef> OffloadArchs); /// Wraps the input fatbinary image into the module \p M as global symbols and /// registers the images with the CUDA runtime. diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -55,11 +55,31 @@ return PointerType::getUnqual(getEntryTy(M)); } +// This matches the runtime struct definition of __tgt_image_info +// declared in openmp/libomptarget/include/omptarget.h / +// struct __tgt_image_info { +// int32_t version; +// int8_t *offload_arch; +// }; +StructType *getImageInfoTy(Module &M) { + LLVMContext &C = M.getContext(); + StructType *ImageInfoTy = StructType::getTypeByName(C, "__tgt_image_info"); + if (!ImageInfoTy) + ImageInfoTy = StructType::create("__tgt_image_info", Type::getInt32Ty(C), + Type::getInt8PtrTy(C)); + return ImageInfoTy; +} + +PointerType *getImageInfoPtrTy(Module &M) { + return PointerType::getUnqual(getImageInfoTy(M)); +} + // struct __tgt_device_image { // void *ImageStart; // void *ImageEnd; // __tgt_offload_entry *EntriesBegin; // __tgt_offload_entry *EntriesEnd; +// __tgt_image_info *ImageInfo; // }; StructType *getDeviceImageTy(Module &M) { LLVMContext &C = M.getContext(); @@ -67,7 +87,7 @@ if (!ImageTy) ImageTy = StructType::create("__tgt_device_image", Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), getEntryPtrTy(M), - getEntryPtrTy(M)); + getEntryPtrTy(M), getImageInfoPtrTy(M)); return ImageTy; } @@ -114,14 +134,16 @@ /// Image0, /*ImageStart*/ /// Image0 + sizeof(Image0), /*ImageEnd*/ /// __start_omp_offloading_entries, /*EntriesBegin*/ -/// __stop_omp_offloading_entries /*EntriesEnd*/ +/// __stop_omp_offloading_entries, /*EntriesEnd*/ +/// __tgt_image_info /*ImageInfo*/ /// 
}, /// ... /// { /// ImageN, /*ImageStart*/ /// ImageN + sizeof(ImageN), /*ImageEnd*/ /// __start_omp_offloading_entries, /*EntriesBegin*/ -/// __stop_omp_offloading_entries /*EntriesEnd*/ +/// __stop_omp_offloading_entries, /*EntriesEnd*/ +/// __tgt_image_info /*ImageInfo*/ /// } /// }; /// @@ -133,7 +155,8 @@ /// }; /// /// Global variable that represents BinDesc is returned. -GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs) { +GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, + ArrayRef> OffloadArchs) { LLVMContext &C = M.getContext(); // Create external begin/end symbols for the offload entries table. auto *EntriesB = new GlobalVariable( @@ -161,6 +184,10 @@ auto *Zero = ConstantInt::get(getSizeTTy(M), 0u); Constant *ZeroZero[] = {Zero, Zero}; + unsigned int ImgCount = 0; + std::string OffloadArchBase = "__offload_arch"; + std::string OffloadImageBase = "offload_image_info"; + // Create initializer for the images array. SmallVector ImagesInits; ImagesInits.reserve(Bufs.size()); @@ -179,8 +206,41 @@ auto *ImageE = ConstantExpr::getGetElementPtr(Image->getValueType(), Image, ZeroSize); - ImagesInits.push_back(ConstantStruct::get(getDeviceImageTy(M), ImageB, - ImageE, EntriesB, EntriesE)); + auto OArch = OffloadArchs[ImgCount]; + Constant *OArchV = ConstantDataArray::get(C, OArch); + std::string OffloadArchGV(OffloadArchBase), + OffloadImageGV(OffloadImageBase); + if (ImgCount) { + auto Suffix = std::to_string(ImgCount); + OffloadArchGV.append(".").append(Suffix); + OffloadImageGV.append(".").append(Suffix); + } + + auto *GV = + new GlobalVariable(M, OArchV->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, OArchV, OffloadArchGV); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // store value of these variables (i.e. offload archs) into a custom + // section which will be used by "offload-arch -f". It won't be + // removed during binary stripping. 
+ GV->setSection(".offload_arch_list"); + + auto *RequirementVPtr = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zero); + RequirementVPtr = + ConstantExpr::getBitCast(RequirementVPtr, Type::getInt8PtrTy(C)); + auto *InfoInit = ConstantStruct::get( + getImageInfoTy(M), ConstantInt::get(Type::getInt32Ty(C), 1), + RequirementVPtr); + ++ImgCount; + auto *ImageInfoGV = + new GlobalVariable(M, InfoInit->getType(), + /*isConstant*/ true, GlobalValue::InternalLinkage, + InfoInit, OffloadImageGV); + ImageInfoGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + ImagesInits.push_back(ConstantStruct::get( + getDeviceImageTy(M), ImageB, ImageE, EntriesB, EntriesE, ImageInfoGV)); } // Then create images array. @@ -218,7 +278,7 @@ auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(M), /*isVarArg*/ false); FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + M.getOrInsertFunction("__tgt_register_lib_v2", RegFuncTy); // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); @@ -531,8 +591,9 @@ } // namespace -Error wrapOpenMPBinaries(Module &M, ArrayRef> Images) { - GlobalVariable *Desc = createBinDesc(M, Images); +Error wrapOpenMPBinaries(Module &M, ArrayRef> Images, + ArrayRef> OffloadArchs) { + GlobalVariable *Desc = createBinDesc(M, Images, OffloadArchs); if (!Desc) return createStringError(inconvertibleErrorCode(), "No binary descriptors created."); diff --git a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp --- a/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp +++ b/clang/tools/clang-offload-wrapper/ClangOffloadWrapper.cpp @@ -284,7 +284,7 @@ auto *RegFuncTy = FunctionType::get(Type::getVoidTy(C), getBinDescPtrTy(), /*isVarArg*/ false); FunctionCallee RegFuncC = - M.getOrInsertFunction("__tgt_register_lib", RegFuncTy); + M.getOrInsertFunction("__tgt_register_lib_v2", RegFuncTy); // Construct function 
body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -119,12 +119,46 @@ int32_t reserved; // Reserved, to be used by the runtime library. }; +/// __tgt_image_info: +/// +/// The information in this struct is provided in the clang-linker-wrapper +/// as a call to __tgt_register_image_info for each image in the library +/// of images also created by the clang-linker-wrapper. +/// __tgt_register_image_info is called for each image BEFORE the single +/// call to __tgt_register_lib so that image information is available +/// before they are loaded. clang-linker-wrapper gets this image information +/// from command line arguments provided by the clang driver when it creates +/// the call to the __clang-linker-wrapper command. +/// This architecture allows the binary image (pointed to by ImageStart and +/// ImageEnd in __tgt_device_image) to remain architecture independent. +/// That is, the architecture independent part of the libomptarget runtime +/// does not need to peer inside the image to determine if it is loadable +/// even though in most cases the image is an elf object. +/// There is one __tgt_image_info for each __tgt_device_image. For backward +/// compatibility, no changes are allowed to either __tgt_device_image or +/// __tgt_bin_desc. The absence of __tgt_image_info is the indication that +/// the runtime is being used on a binary created by an old version of +/// the compiler. +/// +struct __tgt_image_info { + int32_t image_info_version; // The version of this struct (= 1) + char *offload_arch; // e.g. 
sm_30, sm_70, gfx906, includes features +}; + /// This struct is a record of the device image information +struct __tgt_device_image_old { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries + __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) +}; + struct __tgt_device_image { void *ImageStart; // Pointer to the target code start void *ImageEnd; // Pointer to the target code end __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) + __tgt_image_info *ImageInfo; // Metadata about the image (may be null). }; /// This struct is a record of all the host code that may be offloaded to a @@ -136,6 +170,15 @@ __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) }; +/// __tgt_active_offload_env +/// +/// This structure is created by __tgt_get_active_offload_env and is used +/// to determine compatibility of the images with the current environment +/// that is "in play". +struct __tgt_active_offload_env { + char *capabilities; // string returned by offload-arch -c +}; + /// This struct contains the offload entries identified by the target runtime struct __tgt_target_table { __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries @@ -233,6 +276,7 @@ /// adds a target shared library to the target execution image void __tgt_register_lib(__tgt_bin_desc *desc); +void __tgt_register_lib_v2(__tgt_bin_desc *desc); /// Initialize all RTLs at once void __tgt_init_all_rtls(); diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h --- a/openmp/libomptarget/include/rtl.h +++ b/openmp/libomptarget/include/rtl.h @@ -148,6 +148,7 @@ // Register a shared library with all (compatible) RTLs. 
void RegisterLib(__tgt_bin_desc *desc); + void RegisterLibV2(__tgt_bin_desc *desc); // Unregister a shared library from all RTLs. void UnregisterLib(__tgt_bin_desc *desc); diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -2,6 +2,7 @@ global: __tgt_register_requires; __tgt_register_lib; + __tgt_register_lib_v2; __tgt_unregister_lib; __tgt_init_all_rtls; __tgt_target_data_begin; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -43,6 +43,19 @@ PM->RTLs.RegisterLib(desc); } +EXTERN void __tgt_register_lib_v2(__tgt_bin_desc *desc) { + TIMESCOPE(); + std::call_once(PM->RTLs.initFlag, &RTLsTy::LoadRTLs, &PM->RTLs); + for (auto &RTL : PM->RTLs.AllRTLs) { + if (RTL.register_lib) { + if ((*RTL.register_lib)(desc) != OFFLOAD_SUCCESS) { + DP("Could not register library with %s", RTL.RTLName.c_str()); + } + } + } + PM->RTLs.RegisterLibV2(desc); +} + //////////////////////////////////////////////////////////////////////////////// /// Initialize all available devices without registering any image EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); } diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -13,6 +13,7 @@ #include "rtl.h" #include "device.h" #include "private.h" +//#include "llvm/OffloadArch/OffloadArch.h" #include #include @@ -20,6 +21,8 @@ #include #include #include +// It's strange we do not have llvm tools for openmp runtime, so we use stat +#include // List of all plugins that can support offloading. 
static const char *RTLNames[] = { @@ -351,18 +354,127 @@ initRTLonce(R); } +/// Query runtime capabilities of this system by calling offload-arch -c +/// offload_arch_output_buffer is persistent storage returned by this +/// __tgt_get_active_offload_env. +static void +__tgt_get_active_offload_env(__tgt_active_offload_env *active_env, + char *offload_arch_output_buffer, + size_t offload_arch_output_buffer_size) { + + // If OFFLOAD_ARCH_OVERRIDE env variable is present then use its value instead + // of querying it using LLVMOffloadArch library. + if (char *OffloadArchEnvVar = getenv("OFFLOAD_ARCH_OVERRIDE")) { + if (OffloadArchEnvVar) { + active_env->capabilities = OffloadArchEnvVar; + return; + } + } + // Get runtime capabilities of this system with libLLVMOffloadArch.a + // if (int rc = getRuntimeCapabilities(offload_arch_output_buffer, + // offload_arch_output_buffer_size)) + // return; + // active_env->capabilities = offload_arch_output_buffer; + // return; +} + +std::vector _splitstrings(char *input, const char *sep) { + std::vector split_strings; + std::string s(input); + std::string delimiter(sep); + size_t pos = 0; + while ((pos = s.find(delimiter)) != std::string::npos) { + if (pos != 0) + split_strings.push_back(s.substr(0, pos)); + s.erase(0, pos + delimiter.length()); + } + if (s.length() > 1) + split_strings.push_back(s.substr(0, s.length())); + return split_strings; +} + +static bool _ImageIsCompatibleWithEnv(__tgt_image_info *image_info, + __tgt_active_offload_env *active_env) { + // get_image_info will return null if no image information was registered. + // If no image information, assume application built with old compiler and + // check each image. + if (!(image_info && image_info->image_info_version == 1)) + return true; + + if (!active_env->capabilities) + return false; + + // Each runtime requirement for the compiled image is stored in + // the image_info->offload_arch (TargetID) string. 
+ // Each runtime capability obtained from "offload-arch -c" is stored in + // active_env->capabilities (TargetID) string. + // If every requirement has a matching capability, then the image + // is compatible with active environment + + std::vector reqs = _splitstrings(image_info->offload_arch, ":"); + std::vector caps = _splitstrings(active_env->capabilities, ":"); + + bool is_compatible = true; + for (auto req : reqs) { + bool missing_capability = true; + for (auto capability : caps) + if (capability == req) + missing_capability = false; + if (missing_capability) { + DP("Image requires %s but runtime capability %s is missing.\n", + image_info->offload_arch, req.c_str()); + is_compatible = false; + } + } + return is_compatible; +} + void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + + __tgt_device_image *newDeviceImages = + new __tgt_device_image[desc->NumDeviceImages]; + + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + newDeviceImages[i].EntriesBegin = desc->DeviceImages[i].EntriesBegin; + newDeviceImages[i].EntriesEnd = desc->DeviceImages[i].EntriesEnd; + newDeviceImages[i].ImageStart = desc->DeviceImages[i].ImageStart; + newDeviceImages[i].ImageEnd = desc->DeviceImages[i].ImageEnd; + newDeviceImages[i].ImageInfo = nullptr; + // TODO : delete(desc->DeviceImages[i]); + } + + desc->DeviceImages = static_cast<__tgt_device_image *>(newDeviceImages); + + this->RegisterLibV2(desc); +} + +#define MAX_CAPS_STR_SIZE 1024 +void RTLsTy::RegisterLibV2(__tgt_bin_desc *desc) { + + // Get the current active offload environment + __tgt_active_offload_env offload_env = {nullptr}; + // Need a buffer to hold results of offload-arch -c command + size_t offload_arch_output_buffer_size = MAX_CAPS_STR_SIZE; + std::vector offload_arch_output_buffer; + offload_arch_output_buffer.resize(offload_arch_output_buffer_size); + __tgt_get_active_offload_env(&offload_env, offload_arch_output_buffer.data(), + offload_arch_output_buffer_size); + + RTLInfoTy *FoundRTL = NULL; 
PM->RTLsMtx.lock(); // Register the images with the RTLs that understand them, if any. for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { // Obtain the image. __tgt_device_image *img = &desc->DeviceImages[i]; - RTLInfoTy *FoundRTL = nullptr; - + // Get corresponding image info offload_arch and check with runtime + if (!_ImageIsCompatibleWithEnv(img->ImageInfo, &offload_env)) + continue; + FoundRTL = NULL; // Scan the RTLs that have associated images until we find one that supports // the current image. for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", DPxPTR(img->ImageStart), R.RTLName.c_str()); @@ -407,6 +519,39 @@ } PM->RTLsMtx.unlock(); + if (!FoundRTL) { + if (PM->TargetOffloadPolicy == tgt_mandatory) + fprintf(stderr, "ERROR:\ + Runtime capabilities do NOT meet any offload image offload_arch\n\ + and the OMP_TARGET_OFFLOAD policy is mandatory. Terminating!\n\ + Runtime capabilities : %s\n", + offload_env.capabilities); + else if (PM->TargetOffloadPolicy == tgt_disabled) + fprintf(stderr, "WARNING: Offloading is disabled.\n"); + else + fprintf( + stderr, + "WARNING: Runtime capabilities do NOT meet any image offload_arch.\n\ + So device offloading is now disabled.\n\ + Runtime capabilities : %s\n", + offload_env.capabilities); + if (PM->TargetOffloadPolicy != tgt_disabled) { + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + __tgt_image_info *image_info = desc->DeviceImages[i].ImageInfo; + if (image_info && image_info->image_info_version == 1) + fprintf(stderr, "\ + Image %d offload_arch : %s\n", + i, image_info->offload_arch); + else + fprintf(stderr, "\ + Image %d has no offload_arch. Could be from older compiler\n", + i); + } + } + if (PM->TargetOffloadPolicy == tgt_mandatory) + exit(1); + } + DP("Done registering entries!\n"); }