diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -610,6 +610,8 @@ CmdArgs.push_back("gnu"); CmdArgs.push_back("--no-undefined"); CmdArgs.push_back("-shared"); + std::string ArchArg = std::string("-plugin-opt=mcpu=").append(Arch.str()); + CmdArgs.push_back(ArchArg); CmdArgs.push_back("-o"); CmdArgs.push_back(TempFile); @@ -1064,9 +1066,10 @@ Triple TheTriple = Triple(File.TheTriple); auto &LinkerInputFiles = LinkerInput.getSecond(); bool WholeProgram = false; + std::string TheArch = File.Arch; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch, + if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, TheArch, WholeProgram)) return Err; @@ -1075,7 +1078,7 @@ if (LinkerInputFiles.size() != 1 || !WholeProgram) return createStringError(inconvertibleErrorCode(), "Unable to embed bitcode image for JIT"); - LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), File.Arch, + LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), TheArch, LinkerInputFiles.front()); continue; } @@ -1086,18 +1089,18 @@ return createStringError(inconvertibleErrorCode(), "Invalid number of inputs for non-RDC mode"); for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) - LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, + LinkedImages.emplace_back(Kind, TheTriple.getTriple(), TheArch, LinkerInputFiles.front()); continue; } - auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch); + auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, TheArch); if (!ImageOrErr) return ImageOrErr.takeError(); // Create separate images for all the active offload kinds. 
for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) - LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, + LinkedImages.emplace_back(Kind, TheTriple.getTriple(), TheArch, *ImageOrErr); } return Error::success(); @@ -1147,6 +1150,9 @@ Error wrapOpenMPImages(Module &M, ArrayRef Images) { SmallVector, 4> SavedBuffers; SmallVector, 4> ImagesToWrap; + SmallVector, 4> OffloadArchs; + std::string Arch; + OffloadArchs.reserve(Images.size()); for (const DeviceFile &File : Images) { llvm::ErrorOr> ImageOrError = llvm::MemoryBuffer::getFileOrSTDIN(File.Filename); @@ -1155,9 +1161,19 @@ ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(), (*ImageOrError)->getBufferSize()); SavedBuffers.emplace_back(std::move(*ImageOrError)); + + if (Arch.empty()) { + Arch = std::string(File.Arch); + Arch.push_back('\0'); /* only this first entry carries a trailing NUL; later entries use File.Arch.size() */ + OffloadArchs.emplace_back(Arch.data(), Arch.size()); + } else { + auto curSize = Arch.size(); + Arch.append(File.Arch); /* NOTE(review): append may reallocate the buffer of Arch, and earlier OffloadArchs entries point into it - confirm they stay valid */ + OffloadArchs.emplace_back(&(Arch.at(curSize)), File.Arch.size()); + } } - if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap)) + if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap, OffloadArchs)) return Err; return Error::success(); } diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h @@ -14,8 +14,9 @@ /// Wraps the input device images into the module \p M as global symbols and /// registers the images with the OpenMP Offloading runtime libomptarget. -llvm::Error wrapOpenMPBinaries(llvm::Module &M, - llvm::ArrayRef> Images); +llvm::Error +wrapOpenMPBinaries(llvm::Module &M, llvm::ArrayRef> Images, + llvm::ArrayRef> OffloadArchs); /// Wraps the input fatbinary image into the module \p M as global symbols and /// registers the images with the CUDA runtime.
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -55,11 +55,35 @@ return PointerType::getUnqual(getEntryTy(M)); } +// This matches the runtime struct definition of __tgt_image_info +// declared in openmp/libomptarget/include/omptarget.h: +// struct __tgt_image_info { +// int32_t version; +// int32_t image_number; +// int32_t number_images; +// char* offload_arch; +// char* compile_opts; +// }; +StructType *getImageInfoTy(Module &M) { + LLVMContext &C = M.getContext(); + StructType *ImageInfoTy = StructType::getTypeByName(C, "__tgt_image_info"); + if (!ImageInfoTy) // no type of that name in this context yet, so create it + ImageInfoTy = StructType::create( + "__tgt_image_info", Type::getInt32Ty(C), Type::getInt32Ty(C), + Type::getInt32Ty(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)); + return ImageInfoTy; +} + +PointerType *getImageInfoPtrTy(Module &M) { + return PointerType::getUnqual(getImageInfoTy(M)); +} + // struct __tgt_device_image { // void *ImageStart; // void *ImageEnd; // __tgt_offload_entry *EntriesBegin; // __tgt_offload_entry *EntriesEnd; +// __tgt_image_info *ImageInfo; // }; StructType *getDeviceImageTy(Module &M) { LLVMContext &C = M.getContext(); @@ -67,7 +91,7 @@ if (!ImageTy) ImageTy = StructType::create("__tgt_device_image", Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), getEntryPtrTy(M), - getEntryPtrTy(M)); + getEntryPtrTy(M), getImageInfoPtrTy(M)); return ImageTy; } @@ -114,14 +138,16 @@ /// Image0, /*ImageStart*/ /// Image0 + sizeof(Image0), /*ImageEnd*/ /// __start_omp_offloading_entries, /*EntriesBegin*/ -/// __stop_omp_offloading_entries /*EntriesEnd*/ +/// __stop_omp_offloading_entries, /*EntriesEnd*/ +/// __tgt_image_info /*ImageInfo*/ /// }, /// ...
/// { /// ImageN, /*ImageStart*/ /// ImageN + sizeof(ImageN), /*ImageEnd*/ /// __start_omp_offloading_entries, /*EntriesBegin*/ -/// __stop_omp_offloading_entries /*EntriesEnd*/ +/// __stop_omp_offloading_entries, /*EntriesEnd*/ +/// __tgt_image_info /*ImageInfo*/ /// } /// }; /// @@ -133,7 +159,8 @@ /// }; /// /// Global variable that represents BinDesc is returned. -GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs) { +GlobalVariable *createBinDesc(Module &M, ArrayRef> Bufs, + ArrayRef> OffloadArchs) { LLVMContext &C = M.getContext(); // Create external begin/end symbols for the offload entries table. auto *EntriesB = new GlobalVariable( @@ -161,6 +188,11 @@ auto *Zero = ConstantInt::get(getSizeTTy(M), 0u); Constant *ZeroZero[] = {Zero, Zero}; + auto *NullPtr = llvm::ConstantPointerNull::get(Type::getInt8PtrTy(C)); + unsigned int ImgCount = 0; + std::string OffloadArchBase = "__offload_arch"; + std::string OffloadImageBase = "offload_image_info"; + // Create initializer for the images array. SmallVector ImagesInits; ImagesInits.reserve(Bufs.size()); @@ -179,8 +211,44 @@ auto *ImageE = ConstantExpr::getGetElementPtr(Image->getValueType(), Image, ZeroSize); - ImagesInits.push_back(ConstantStruct::get(getDeviceImageTy(M), ImageB, - ImageE, EntriesB, EntriesE)); + auto OArch = OffloadArchs[ImgCount]; + Constant *OArchV = ConstantDataArray::get(C, OArch); + std::string OffloadArchGV(OffloadArchBase), + OffloadImageGV(OffloadImageBase); + if (ImgCount) { + auto Suffix = std::to_string(ImgCount); + OffloadArchGV.append(".").append(Suffix); + OffloadImageGV.append(".").append(Suffix); + } + + auto *GV = + new GlobalVariable(M, OArchV->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, OArchV, OffloadArchGV); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // store value of these variables (i.e. offload archs) into a custom + // section which will be used by "offload-arch -f". It won't be + // removed during binary stripping. 
+ GV->setSection(".offload_arch_list"); + + auto *RequirementVPtr = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zero); + RequirementVPtr = + ConstantExpr::getBitCast(RequirementVPtr, Type::getInt8PtrTy(C)); + auto *InfoInit = ConstantStruct::get( + getImageInfoTy(M), ConstantInt::get(Type::getInt32Ty(C), 1), + ConstantInt::get(Type::getInt32Ty(C), ImgCount++), + ConstantInt::get(Type::getInt32Ty(C), (uint32_t)OffloadArchs.size()), + RequirementVPtr, + NullPtr // TODO: capture target-compile-opts from clang driver + ); + auto *ImageInfoGV = + new GlobalVariable(M, InfoInit->getType(), + /*isConstant*/ true, GlobalValue::InternalLinkage, + InfoInit, OffloadImageGV); + ImageInfoGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + ImagesInits.push_back(ConstantStruct::get( + getDeviceImageTy(M), ImageB, ImageE, EntriesB, EntriesE, ImageInfoGV)); } // Then create images array. @@ -531,8 +599,9 @@ } // namespace -Error wrapOpenMPBinaries(Module &M, ArrayRef> Images) { - GlobalVariable *Desc = createBinDesc(M, Images); +Error wrapOpenMPBinaries(Module &M, ArrayRef> Images, + ArrayRef> OffloadArchs) { + GlobalVariable *Desc = createBinDesc(M, Images, OffloadArchs); if (!Desc) return createStringError(inconvertibleErrorCode(), "No binary descriptors created."); diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -119,12 +119,42 @@ int32_t reserved; // Reserved, to be used by the runtime library. }; +/// __tgt_image_info: +/// +/// The information in this struct is provided in the clang-linker-wrapper +/// as a call to __tgt_register_image_info for each image in the library +/// of images also created by the clang-linker-wrapper. +/// __tgt_register_image_info is called for each image BEFORE the single +/// call to __tgt_register_lib so that image information is available +/// before they are loaded. 
clang-linker-wrapper gets this image information +/// from command line arguments provided by the clang driver when it creates +/// the call to the clang-linker-wrapper command. +/// This architecture allows the binary image (pointed to by ImageStart and +/// ImageEnd in __tgt_device_image) to remain architecture independent. +/// That is, the architecture independent part of the libomptarget runtime +/// does not need to peer inside the image to determine if it is loadable +/// even though in most cases the image is an ELF object. +/// There is one __tgt_image_info for each __tgt_device_image. For backward +/// compatibility, no changes are allowed to either __tgt_device_image or +/// __tgt_bin_desc. The absence of __tgt_image_info is the indication that +/// the runtime is being used on a binary created by an old version of +/// the compiler. NOTE(review): __tgt_device_image below gains an ImageInfo field, which conflicts with the no-changes rule stated above - confirm. +/// +struct __tgt_image_info { + int32_t version; // The version of this struct + int32_t image_number; // Image number in image library starting from 0 + int32_t number_images; // Number of images, used for initial allocation + char *offload_arch; // e.g. sm_30, sm_70, gfx906, includes features + char *compile_opts; // reserved for future use +}; + /// This struct is a record of the device image information struct __tgt_device_image { void *ImageStart; // Pointer to the target code start void *ImageEnd; // Pointer to the target code end __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) + __tgt_image_info *ImageInfo; // Metadata about the image }; /// This struct is a record of all the host code that may be offloaded to a @@ -136,6 +166,15 @@ __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) }; +/// __tgt_active_offload_env +/// +/// This structure is created by __tgt_get_active_offload_env and is used +/// to determine compatibility of the images with the current environment +/// that is "in play".
+struct __tgt_active_offload_env { +char *capabilities; // string returned by offload-arch -c +}; + /// This struct contains the offload entries identified by the target runtime struct __tgt_target_table { __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -13,6 +13,7 @@ #include "rtl.h" #include "device.h" #include "private.h" +//#include "llvm/OffloadArch/OffloadArch.h" #include #include @@ -20,6 +21,8 @@ #include #include #include +// It's strange we do not have llvm tools for openmp runtime, so we use stat +#include // List of all plugins that can support offloading. static const char *RTLNames[] = { @@ -351,18 +354,108 @@ initRTLonce(R); } +/// Query runtime capabilities of this system by calling offload-arch -c +/// offload_arch_output_buffer is persistent storage returned by this +/// __tgt_get_active_offload_env. +static void +__tgt_get_active_offload_env(__tgt_active_offload_env *active_env, + char *offload_arch_output_buffer, + size_t offload_arch_output_buffer_size) { + + // If OFFLOAD_ARCH_OVERRIDE env variable is present then use its value instead + // of querying it using LLVMOffloadArch library.
+ if (char *OffloadArchEnvVar = getenv("OFFLOAD_ARCH_OVERRIDE")) { + if (OffloadArchEnvVar) { + active_env->capabilities = OffloadArchEnvVar; + return; + } + } + // Get runtime capabilities of this system with libLLVMOffloadArch.a (the query below is commented out, so capabilities is left untouched here) + // if (int rc = getRuntimeCapabilities(offload_arch_output_buffer, + // offload_arch_output_buffer_size)) + // return; + // active_env->capabilities = offload_arch_output_buffer; + // return; +} + +std::vector<std::string> _splitstrings(char *input, const char *sep) { // splits input on sep, skipping empty tokens + std::vector<std::string> split_strings; + std::string s(input); + std::string delimiter(sep); + size_t pos = 0; + while ((pos = s.find(delimiter)) != std::string::npos) { + if (pos != 0) + split_strings.push_back(s.substr(0, pos)); + s.erase(0, pos + delimiter.length()); + } + if (!s.empty()) // keep the final token even when it is a single character + split_strings.push_back(s.substr(0, s.length())); + return split_strings; +} + +static bool _ImageIsCompatibleWithEnv(__tgt_image_info *image_info, + __tgt_active_offload_env *active_env) { + // get_image_info will return null if no image information was registered. + // If no image information, assume application built with old compiler and + // check each image. + if (!image_info) + return true; + + if (!active_env->capabilities) + return false; + + // Each runtime requirement for the compiled image is stored in + // the image_info->offload_arch (TargetID) string. + // Each runtime capability obtained from "offload-arch -c" is stored in + // active_env->capabilities (TargetID) string.
+ // If every requirement has a matching capability, then the image + // is compatible with active environment + + std::vector reqs = _splitstrings(image_info->offload_arch, ":"); + std::vector caps = _splitstrings(active_env->capabilities, ":"); + + bool is_compatible = true; + for (auto req : reqs) { + bool missing_capability = true; + for (auto capability : caps) + if (capability == req) + missing_capability = false; + if (missing_capability) { + DP("Image requires %s but runtime capability %s is missing.\n", + image_info->offload_arch, req.c_str()); + is_compatible = false; + } + } + return is_compatible; +} + +#define MAX_CAPS_STR_SIZE 1024 void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + + // Get the current active offload environment + __tgt_active_offload_env offload_env = {nullptr}; + // Need a buffer to hold results of offload-arch -c command + size_t offload_arch_output_buffer_size = MAX_CAPS_STR_SIZE; + std::vector offload_arch_output_buffer; + offload_arch_output_buffer.resize(offload_arch_output_buffer_size); + __tgt_get_active_offload_env(&offload_env, offload_arch_output_buffer.data(), + offload_arch_output_buffer_size); + + RTLInfoTy *FoundRTL = NULL; PM->RTLsMtx.lock(); // Register the images with the RTLs that understand them, if any. for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { // Obtain the image. __tgt_device_image *img = &desc->DeviceImages[i]; - RTLInfoTy *FoundRTL = nullptr; - + // Get corresponding image info offload_arch and check with runtime + if (!_ImageIsCompatibleWithEnv(img->ImageInfo, &offload_env)) + continue; + FoundRTL = NULL; // Scan the RTLs that have associated images until we find one that supports // the current image. 
for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", DPxPTR(img->ImageStart), R.RTLName.c_str()); @@ -407,6 +500,39 @@ } PM->RTLsMtx.unlock(); + if (!FoundRTL) { + if (PM->TargetOffloadPolicy == tgt_mandatory) + fprintf(stderr, "ERROR:\ + Runtime capabilities do NOT meet any offload image offload_arch\n\ + and the OMP_TARGET_OFFLOAD policy is mandatory. Terminating!\n\ + Runtime capabilities : %s\n", + offload_env.capabilities); + else if (PM->TargetOffloadPolicy == tgt_disabled) + fprintf(stderr, "WARNING: Offloading is disabled.\n"); + else + fprintf( + stderr, + "WARNING: Runtime capabilities do NOT meet any image offload_arch.\n\ + So device offloading is now disabled.\n\ + Runtime capabilities : %s\n", + offload_env.capabilities); + if (PM->TargetOffloadPolicy != tgt_disabled) { + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + __tgt_image_info *image_info = desc->DeviceImages[i].ImageInfo; + if (image_info) + fprintf(stderr, "\ + Image %d offload_arch : %s\n", + i, image_info->offload_arch); + else + fprintf(stderr, "\ + Image %d has no offload_arch. Could be from older compiler\n", + i); + } + } + if (PM->TargetOffloadPolicy == tgt_mandatory) + exit(1); + } + DP("Done registering entries!\n"); }