diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -172,6 +172,15 @@ std::string Filename; }; +/// Information for an offloading image obtained after linking device images. +struct ImageInfo { + ImageInfo(StringRef Arch, StringRef Imagename) + : Arch(Arch), Imagename(Imagename) {} + + std::string Arch; + std::string Imagename; +}; + namespace llvm { /// Helper that allows DeviceFile to be used as a key in a DenseMap. template <> struct DenseMapInfo { @@ -702,6 +711,8 @@ CmdArgs.push_back("gnu"); CmdArgs.push_back("--no-undefined"); CmdArgs.push_back("-shared"); + std::string ArchArg = std::string("-plugin-opt=mcpu=").append(Arch.str()); + CmdArgs.push_back(ArchArg); CmdArgs.push_back("-o"); CmdArgs.push_back(TempFile); @@ -1110,7 +1121,7 @@ /// Runs the appropriate linking action on all the device files specified in \p /// DeviceFiles. The linked device images are returned in \p LinkedImages. Error linkDeviceFiles(ArrayRef DeviceFiles, - SmallVectorImpl &LinkedImages) { + SmallVectorImpl &LinkedImages) { // Get the list of inputs for a specific device. DenseMap> LinkerInputMap; for (auto &File : DeviceFiles) @@ -1120,17 +1131,18 @@ for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + std::string TheArch = File.Arch; bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, - File.Arch, WholeProgram)) + TheArch, WholeProgram)) return Err; // If we are embedding bitcode for JIT, skip the final device linking. 
if (EmbedBitcode) { assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed"); - LinkedImages.push_back(LinkerInput.getSecond().front()); + LinkedImages.emplace_back(TheArch, LinkerInput.getSecond().front()); continue; } @@ -1138,15 +1150,15 @@ // CUDA in non-RDC mode. if (WholeProgram && TheTriple.isNVPTX()) { assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); - LinkedImages.push_back(LinkerInput.getSecond().front()); + LinkedImages.emplace_back(TheArch, LinkerInput.getSecond().front()); continue; } - auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); + auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, TheArch); if (!ImageOrErr) return ImageOrErr.takeError(); - LinkedImages.push_back(*ImageOrErr); + LinkedImages.emplace_back(TheArch, *ImageOrErr); } return Error::success(); } @@ -1193,11 +1205,14 @@ /// Creates the object file containing the device image and runtime registration /// code from the device images stored in \p Images. 
-Expected wrapDeviceImages(ArrayRef Images) { +Expected wrapDeviceImages(ArrayRef Images) { SmallVector, 4> SavedBuffers; SmallVector, 4> ImagesToWrap; - - for (StringRef ImageFilename : Images) { + SmallVector, 4u> OffloadArchs; + OffloadArchs.reserve(Images.size()); + + for (const ImageInfo &Image : Images) { + StringRef ImageFilename = Image.Imagename; llvm::ErrorOr> ImageOrError = llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename); if (std::error_code EC = ImageOrError.getError()) @@ -1205,12 +1220,16 @@ ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(), (*ImageOrError)->getBufferSize()); SavedBuffers.emplace_back(std::move(*ImageOrError)); + + // OffloadArchs only stores pointer/length views, so view the caller-owned + // string (a loop-local copy would dangle); c_str() supplies the trailing NUL. + OffloadArchs.emplace_back(Image.Arch.c_str(), Image.Arch.size() + 1); } LLVMContext Context; Module M("offload.wrapper.module", Context); M.setTargetTriple(HostTriple); - if (Error Err = wrapBinaries(M, ImagesToWrap)) + if (Error Err = wrapBinaries(M, ImagesToWrap, OffloadArchs)) return std::move(Err); if (PrintWrappedModule) @@ -1347,7 +1366,7 @@ DeviceFiles.push_back(getBitcodeLibrary(LibraryStr)); // Link the device images extracted from the linker input. - SmallVector LinkedImages; + SmallVector LinkedImages; if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages)) return reportError(std::move(Err)); diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h @@ -15,6 +15,7 @@ /// Wrap the input device images into the module \p M as global symbols and /// registers the images with the OpenMP Offloading runtime libomptarget. 
llvm::Error wrapBinaries(llvm::Module &M, - llvm::ArrayRef> Images); + llvm::ArrayRef> Images, + llvm::ArrayRef> OffloadArchs); #endif diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -93,6 +93,29 @@ return PointerType::getUnqual(getBinDescTy(M)); } + // This matches the runtime struct definition of __tgt_image_info + // declared in openmp/libomptarget/include/omptarget.h: + // struct __tgt_image_info { + // int32_t version; + // int32_t image_number; + // int32_t number_images; + // char* offload_arch; + // char* compile_opts; + // }; + StructType *getImageInfoTy(Module &M) { + LLVMContext &C = M.getContext(); + StructType *ImageInfoTy = StructType::getTypeByName(C, "__tgt_image_info"); + if (!ImageInfoTy) + ImageInfoTy = StructType::create( + "__tgt_image_info", Type::getInt32Ty(C), Type::getInt32Ty(C), + Type::getInt32Ty(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)); + return ImageInfoTy; + } + + PointerType *getImageInfoPtrTy(Module &M) { + return PointerType::getUnqual(getImageInfoTy(M)); + } + /// Creates binary descriptor for the given device images. 
Binary descriptor /// is an object that is passed to the offloading runtime at program startup /// and it describes all device images available in the executable or shared @@ -205,7 +228,7 @@ ".omp_offloading.descriptor"); } -void createRegisterFunction(Module &M, GlobalVariable *BinDesc) { +void createRegisterFunction(Module &M, GlobalVariable *BinDesc, ArrayRef> OffloadArchs) { LLVMContext &C = M.getContext(); auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, @@ -220,6 +243,56 @@ // Construct function body IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); + // Create calls to __tgt_register_image_info for each image + auto *NullPtr = llvm::ConstantPointerNull::get(Builder.getInt8PtrTy()); + auto *Zero = ConstantInt::get(getSizeTTy(M), 0u); + auto *RegInfoFuncTy = + FunctionType::get(Type::getVoidTy(C), getImageInfoPtrTy(M), false); + FunctionCallee RegInfoFuncC = + M.getOrInsertFunction("__tgt_register_image_info", RegInfoFuncTy); + unsigned int ImgCount = 0; + std::string OffloadArchBase = "__offload_arch"; + std::string OffloadImageBase = "offload_image_info"; + + for (ArrayRef OArch : OffloadArchs) { + Constant *OArchV = ConstantDataArray::get(C, OArch); + std::string OffloadArchGV(OffloadArchBase), OffloadImageGV(OffloadImageBase); + if(ImgCount) { + auto Suffix = std::to_string(ImgCount); + OffloadArchGV.append(".").append(Suffix); + OffloadImageGV.append(".").append(Suffix); + } + + auto *GV = + new GlobalVariable(M, OArchV->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, OArchV, + OffloadArchGV); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // store value of these variables (i.e. offload archs) into a custom + // section which will be used by "offload-arch -f". It won't be + // removed during binary stripping. 
+ GV->setSection(".offload_arch_list"); + + auto *RequirementVPtr = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zero); + RequirementVPtr = + ConstantExpr::getBitCast(RequirementVPtr, Type::getInt8PtrTy(C)); + auto *InfoInit = ConstantStruct::get( + getImageInfoTy(M), ConstantInt::get(Type::getInt32Ty(C), 1), + ConstantInt::get(Type::getInt32Ty(C), ImgCount++), + ConstantInt::get(Type::getInt32Ty(C), (uint32_t)OffloadArchs.size()), + RequirementVPtr, + NullPtr // TODO: capture target-compile-opts from clang driver + ); + auto *ImageInfoGV = new GlobalVariable( + M, InfoInit->getType(), + /*isConstant*/ true, GlobalValue::InternalLinkage, InfoInit, + OffloadImageGV); + ImageInfoGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + Builder.CreateCall(RegInfoFuncC, ImageInfoGV); + } + Builder.CreateCall(RegFuncC, BinDesc); Builder.CreateRetVoid(); @@ -257,12 +330,12 @@ } // namespace -Error wrapBinaries(Module &M, ArrayRef> Images) { +Error wrapBinaries(Module &M, ArrayRef> Images, ArrayRef> OffloadArchs) { GlobalVariable *Desc = createBinDesc(M, Images); if (!Desc) return createStringError(inconvertibleErrorCode(), "No binary descriptors created."); - createRegisterFunction(M, Desc); + createRegisterFunction(M, Desc, OffloadArchs); createUnregisterFunction(M, Desc); return Error::success(); } diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -136,6 +136,44 @@ __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) }; +/// __tgt_image_info: +/// +/// The information in this struct is provided in the clang-linker-wrapper +/// as a call to __tgt_register_image_info for each image in the library +/// of images also created by the clang-linker-wrapper. 
+/// __tgt_register_image_info is called for each image BEFORE the single +/// call to __tgt_register_lib so that image information is available +/// before the images are loaded. clang-linker-wrapper gets this image information +/// from command line arguments provided by the clang driver when it creates +/// the call to the clang-linker-wrapper command. +/// This architecture allows the binary image (pointed to by ImageStart and +/// ImageEnd in __tgt_device_image) to remain architecture independent. +/// That is, the architecture independent part of the libomptarget runtime +/// does not need to peer inside the image to determine if it is loadable +/// even though in most cases the image is an ELF object. +/// There is one __tgt_image_info for each __tgt_device_image. For backward +/// compatibility, no changes are allowed to either __tgt_device_image or +/// __tgt_bin_desc. The absence of __tgt_image_info is the indication that +/// the runtime is being used on a binary created by an old version of +/// the compiler. +/// +struct __tgt_image_info { + int32_t version; // The version of this struct + int32_t image_number; // Image number in image library starting from 0 + int32_t number_images; // Number of images, used for initial allocation + char *offload_arch; // e.g. sm_30, sm_70, gfx906, includes features + char *compile_opts; // reserved for future use +}; + +/// __tgt_active_offload_env +/// +/// This structure is filled in by __tgt_get_active_offload_env and is used +/// to determine compatibility of the images with the current environment +/// that is "in play". 
+struct __tgt_active_offload_env { +char *capabilities; // string returned by offload-arch -c +}; + /// This struct contains the offload entries identified by the target runtime struct __tgt_target_table { __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries @@ -236,6 +274,12 @@ /// Initialize all RTLs at once void __tgt_init_all_rtls(); +/// adds an image information struct, called for each image +void __tgt_register_image_info(__tgt_image_info *imageInfo); + +/// gets pointer to image information for specified image number +/// Returns nullptr for apps built with old version of compiler +__tgt_image_info *__tgt_get_image_info(uint32_t image_num); /// removes a target shared library from the target execution image void __tgt_unregister_lib(__tgt_bin_desc *desc); diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -2,6 +2,7 @@ global: __tgt_register_requires; __tgt_register_lib; + __tgt_register_image_info; __tgt_unregister_lib; __tgt_init_all_rtls; __tgt_target_data_begin; diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -43,6 +43,29 @@ PM->RTLs.RegisterLib(desc); } +static __tgt_image_info **__tgt_AllImageInfos; +static int __tgt_num_registered_images = 0; +EXTERN void __tgt_register_image_info(__tgt_image_info *imageInfo) { + + DP(" register_image_info image %d of %d offload-arch:%s VERSION:%d\n", + imageInfo->image_number, imageInfo->number_images, imageInfo->offload_arch, + imageInfo->version); + if (!__tgt_AllImageInfos) + __tgt_AllImageInfos = (__tgt_image_info **)malloc( + sizeof(__tgt_image_info *) * imageInfo->number_images); + __tgt_AllImageInfos[imageInfo->image_number] = imageInfo; + __tgt_num_registered_images = imageInfo->number_images; +} + 
+//////////////////////////////////////////////////////////////////////////////// +/// Return image information for image_number, or nullptr if none is registered or the index is out of range +EXTERN __tgt_image_info *__tgt_get_image_info(unsigned image_number) { + if (image_number < static_cast<unsigned>(__tgt_num_registered_images)) + return __tgt_AllImageInfos[image_number]; + else + return nullptr; +} + //////////////////////////////////////////////////////////////////////////////// /// Initialize all available devices without registering any image EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); } @@ -59,6 +82,11 @@ } } } + if (__tgt_num_registered_images) { + free(__tgt_AllImageInfos); + __tgt_AllImageInfos = nullptr; + __tgt_num_registered_images = 0; + } } /// creates host-to-target data mapping, stores it in the diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -13,6 +13,7 @@ #include "rtl.h" #include "device.h" #include "private.h" +#include "llvm/OffloadArch/OffloadArch.h" #include #include @@ -20,6 +21,8 @@ #include #include #include +// It's strange we do not have llvm tools for openmp runtime, so we use stat +#include // List of all plugins that can support offloading. static const char *RTLNames[] = { @@ -351,18 +354,109 @@ initRTLonce(R); } +/// Query runtime capabilities of this system (the equivalent of offload-arch -c). +/// offload_arch_output_buffer is persistent caller-provided storage that +/// active_env->capabilities points to on success. +static void +__tgt_get_active_offload_env(__tgt_active_offload_env *active_env, + char *offload_arch_output_buffer, + size_t offload_arch_output_buffer_size) { + + // If OFFLOAD_ARCH_OVERRIDE env variable is present then use its value instead of + // querying it using LLVMOffloadArch library. 
+ if (char *OffloadArchEnvVar = getenv("OFFLOAD_ARCH_OVERRIDE")) { + if (OffloadArchEnvVar) { + active_env->capabilities = OffloadArchEnvVar; + return; + } + } + // Query runtime capabilities of this system with libLLVMOffloadArch.a + if (int rc = getRuntimeCapabilities(offload_arch_output_buffer, + offload_arch_output_buffer_size)) + return; + active_env->capabilities = offload_arch_output_buffer; + return; +} + +std::vector _splitstrings(char *input, const char *sep) { + std::vector split_strings; + std::string s(input); + std::string delimiter(sep); + size_t pos = 0; + while ((pos = s.find(delimiter)) != std::string::npos) { + if (pos != 0) + split_strings.push_back(s.substr(0, pos)); + s.erase(0, pos + delimiter.length()); + } + if (s.length() > 1) + split_strings.push_back(s.substr(0, s.length())); + return split_strings; +} + +static bool _ImageIsCompatibleWithEnv(__tgt_image_info *img_info, + __tgt_active_offload_env *active_env) { + // get_image_info will return null if no image information was registered. + // If no image information, assume application built with old compiler and + // check each image. + if (!img_info) + return true; + + if (!active_env->capabilities) + return false; + + // Each runtime requirement for the compiled image is stored in + // the img_info->offload_arch (TargetID) string. + // Each runtime capability obtained from "offload-arch -c" is stored in + // active_env->capabilities (TargetID) string. 
+ // If every requirement has a matching capability, then the image + // is compatible with active environment + + std::vector reqs = _splitstrings(img_info->offload_arch, ":"); + std::vector caps = _splitstrings(active_env->capabilities, ":"); + + bool is_compatible = true; + for (auto req : reqs) { + bool missing_capability = true; + for (auto capability : caps) + if (capability == req) + missing_capability = false; + if (missing_capability) { + DP("Image requires %s but runtime capability %s is missing.\n", + img_info->offload_arch, req.c_str()); + is_compatible = false; + } + } + return is_compatible; +} + +#define MAX_CAPS_STR_SIZE 1024 void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + + // Get the current active offload environment + __tgt_active_offload_env offload_env = { nullptr }; + // Need a buffer to hold results of offload-arch -c command + size_t offload_arch_output_buffer_size = MAX_CAPS_STR_SIZE; + std::vector offload_arch_output_buffer; + offload_arch_output_buffer.resize(offload_arch_output_buffer_size); + __tgt_get_active_offload_env(&offload_env, offload_arch_output_buffer.data(), + offload_arch_output_buffer_size); + + RTLInfoTy *FoundRTL = NULL; PM->RTLsMtx.lock(); // Register the images with the RTLs that understand them, if any. for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { // Obtain the image. __tgt_device_image *img = &desc->DeviceImages[i]; - RTLInfoTy *FoundRTL = nullptr; - + // Get corresponding image info offload_arch and check with runtime + __tgt_image_info *img_info = __tgt_get_image_info(i); + if (!_ImageIsCompatibleWithEnv(img_info, &offload_env)) + continue; + FoundRTL = NULL; // Scan the RTLs that have associated images until we find one that supports // the current image. 
for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", DPxPTR(img->ImageStart), R.RTLName.c_str()); @@ -407,6 +501,43 @@ } PM->RTLsMtx.unlock(); + if (!FoundRTL) { + // capabilities is still null when the capability query failed; passing a + // null pointer for %s is undefined behavior, so print a placeholder. + const char *CapsStr = + offload_env.capabilities ? offload_env.capabilities : "(unavailable)"; + if (PM->TargetOffloadPolicy == tgt_mandatory) + fprintf(stderr, "ERROR:\ + Runtime capabilities do NOT meet any offload image offload_arch\n\ + and the OMP_TARGET_OFFLOAD policy is mandatory. Terminating!\n\ + Runtime capabilities : %s\n", + CapsStr); + else if (PM->TargetOffloadPolicy == tgt_disabled) + fprintf(stderr, "WARNING: Offloading is disabled.\n"); + else + fprintf( + stderr, + "WARNING: Runtime capabilities do NOT meet any image offload_arch.\n\ + So device offloading is now disabled.\n\ + Runtime capabilities : %s\n", + CapsStr); + if (PM->TargetOffloadPolicy != tgt_disabled) { + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + __tgt_image_info *img_info = __tgt_get_image_info(i); + if (img_info) + fprintf(stderr, "\ + Image %d offload_arch : %s\n", + i, img_info->offload_arch); + else + fprintf(stderr, "\ + Image %d has no offload_arch. Could be from older compiler\n", + i); + } + } + if (PM->TargetOffloadPolicy == tgt_mandatory) + exit(1); + } + DP("Done registering entries!\n"); }