Index: llvm/include/llvm/Transforms/IPO/OpenMPOpt.h =================================================================== --- llvm/include/llvm/Transforms/IPO/OpenMPOpt.h +++ llvm/include/llvm/Transforms/IPO/OpenMPOpt.h @@ -9,14 +9,334 @@ #ifndef LLVM_TRANSFORMS_IPO_OPENMP_OPT_H #define LLVM_TRANSFORMS_IPO_OPENMP_OPT_H +#include "llvm/ADT/EnumeratedArray.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/Transforms/IPO/Attributor.h" +#include "llvm/Transforms/Utils/CallGraphUpdater.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/LazyCallGraph.h" #include "llvm/IR/PassManager.h" +#include "llvm/Analysis/MemorySSA.h" namespace llvm { - namespace omp { +using namespace types; + +/// OpenMP specific information. For now, stores RFIs and ICVs also needed for +/// Attributor runs. +struct OMPInformationCache : public InformationCache { + OMPInformationCache(Module &M, AnalysisGetter &AG, + BumpPtrAllocator &Allocator, SetVector *CGSCC, + SmallPtrSetImpl &ModuleSlice) + : InformationCache(M, AG, Allocator, CGSCC), ModuleSlice(ModuleSlice), + OMPBuilder(M) { + initializeTypes(M); + initializeRuntimeFunctions(); + initializeInternalControlVars(); + + OMPBuilder.initialize(); + } + + /// Generic information that describes an internal control variable. + struct InternalControlVarInfo { + /// The kind, as described by InternalControlVar enum. + InternalControlVar Kind; + + /// The name of the ICV. + StringRef Name; + + /// Environment variable associated with this ICV. + StringRef EnvVarName; + + /// Initial value kind. + ICVInitValue InitKind; + + /// Initial value. + ConstantInt *InitValue; + + /// Setter RTL function associated with this ICV. + RuntimeFunction Setter; + + /// Getter RTL function associated with this ICV. 
+ RuntimeFunction Getter; + + /// RTL Function corresponding to the override clause of this ICV + RuntimeFunction Clause; + }; + + /// Generic information that describes a runtime function + struct RuntimeFunctionInfo { + + /// The kind, as described by the RuntimeFunction enum. + RuntimeFunction Kind; + + /// The name of the function. + StringRef Name; + + /// Flag to indicate a variadic function. + bool IsVarArg; + + /// The return type of the function. + Type *ReturnType; + + /// The argument types of the function. + SmallVector ArgumentTypes; + + /// The declaration if available. + Function *Declaration = nullptr; + + /// Uses of this runtime function per function containing the use. + using UseVector = SmallVector; + + /// Return the vector of uses in function \p F. + UseVector &getOrCreateUseVector(Function *F) { + std::unique_ptr &UV = UsesMap[F]; + if (!UV) + UV = std::make_unique(); + return *UV; + } + + /// Return the vector of uses in function \p F or `nullptr` if there are + /// none. + const UseVector *getUseVector(Function &F) const { + auto I = UsesMap.find(&F); + if (I != UsesMap.end()) + return I->second.get(); + return nullptr; + } + + /// Return how many functions contain uses of this runtime function. + size_t getNumFunctionsWithUses() const { return UsesMap.size(); } + + /// Return the number of arguments (or the minimal number for variadic + /// functions). + size_t getNumArgs() const { return ArgumentTypes.size(); } + + /// Run the callback \p CB on each use and forget the use if the result is + /// true. The callback will be fed the function in which the use was + /// encountered as second argument. + void foreachUse(function_ref CB) { + for (auto &It : UsesMap) + foreachUse(CB, It.first, It.second.get()); + } + + /// Run the callback \p CB on each use within the function \p F and forget + /// the use if the result is true. 
+ void foreachUse(function_ref CB, Function *F, + UseVector *Uses = nullptr) { + SmallVector ToBeDeleted; + ToBeDeleted.clear(); + + unsigned Idx = 0; + UseVector &UV = Uses ? *Uses : getOrCreateUseVector(F); + + for (Use *U : UV) { + if (CB(*U, *F)) + ToBeDeleted.push_back(Idx); + ++Idx; + } + + // Remove the to-be-deleted indices in reverse order as prior + // modifcations will not modify the smaller indices. + while (!ToBeDeleted.empty()) { + unsigned Idx = ToBeDeleted.pop_back_val(); + UV[Idx] = UV.back(); + UV.pop_back(); + } + } + + private: + /// Map from functions to all uses of this runtime function contained in + /// them. + DenseMap> UsesMap; + }; + + /// The slice of the module we are allowed to look at. + SmallPtrSetImpl &ModuleSlice; + + /// An OpenMP-IR-Builder instance + OpenMPIRBuilder OMPBuilder; + + /// Map from runtime function kind to the runtime function description. + EnumeratedArray + RFIs; + + /// Map from ICV kind to the ICV description. + EnumeratedArray + ICVs; + + /// Helper to initialize all internal control variable information for those + /// defined in OMPKinds.def. + void initializeInternalControlVars(); + + /// Returns true if the function declaration \p F matches the runtime + /// function types, that is, return type \p RTFRetType, and argument types + /// \p RTFArgTypes. + static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, + SmallVector &RTFArgTypes); + + /// Helper to initialize all runtime function information for those defined + /// in OpenMPKinds.def. + void initializeRuntimeFunctions(); +}; + +struct OpenMPOpt { + + using OptimizationRemarkGetter = + function_ref; + + OpenMPOpt(SmallVectorImpl &SCC, CallGraphUpdater &CGUpdater, + OptimizationRemarkGetter OREGetter, + OMPInformationCache &OMPInfoCache) + : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), + OREGetter(OREGetter), OMPInfoCache(OMPInfoCache) {} + + /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. 
+ bool run(); + + /// Return the call if \p U is a callee use in a regular call. If \p RFI is + /// given it has to be the callee or a nullptr is returned. + static CallInst *getCallIfRegularCall( + Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr); + + /// Return the call if \p V is a regular call. If \p RFI is given it has to be + /// the callee or a nullptr is returned. + static CallInst *getCallIfRegularCall( + Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr); + +private: + /// Try to eliminiate runtime calls by reusing existing ones. + bool deduplicateRuntimeCalls(); + + /// Try to delete parallel regions if possible. + bool deleteParallelRegions(); + + /// Tries to hide the latency of runtime calls that involve host to + /// device memory transfers. + bool hideMemTransfersLatency(); + + static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, + bool GlobalOnly, bool &SingleChoice); + + /// Return an `struct ident_t*` value that represents the ones used in the + /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not + /// return a local `struct ident_t*`. For now, if we cannot find a suitable + /// return value we create one from scratch. We also do not yet combine + /// information, e.g., the source locations, see combinedIdentStruct. + Value * + getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI, + Function &F, bool GlobalOnly); + + /// Try to eliminiate calls of \p RFI in \p F by reusing an existing one or + /// \p ReplVal if given. + bool deduplicateRuntimeCalls(Function &F, + OMPInformationCache::RuntimeFunctionInfo &RFI, + Value *ReplVal = nullptr); + + /// Collect arguments that represent the global thread id in \p GTIdArgs. + void collectGlobalThreadIdArguments(SmallSetVector >IdArgs); + + /// Emit a remark generically + /// + /// This template function can be used to generically emit a remark. 
The + /// RemarkKind should be one of the following: + /// - OptimizationRemark to indicate a successful optimization attempt + /// - OptimizationRemarkMissed to report a failed optimization attempt + /// - OptimizationRemarkAnalysis to provide additional information about an + /// optimization attempt + /// + /// The remark is built using a callback function provided by the caller that + /// takes a RemarkKind as input and returns a RemarkKind. + template > + void emitRemark(Instruction *Inst, StringRef RemarkName, + RemarkCallBack &&RemarkCB); + + /// Emit a remark on a function. Since only OptimizationRemark is supporting + /// this, it can't be made generic. + void emitRemarkOnFunction( + Function *F, StringRef RemarkName, + function_ref &&RemarkCB); + + /// The underyling module. + Module &M; + + /// The SCC we are operating on. + SmallVectorImpl &SCC; + + /// Callback to update the call graph, the first argument is a removed call, + /// the second an optional replacement call. + CallGraphUpdater &CGUpdater; + + /// Callback to get an OptimizationRemarkEmitter from a Function * + OptimizationRemarkGetter OREGetter; + + /// OpenMP-specific information cache. Also Used for Attributor runs. + OMPInformationCache &OMPInfoCache; +}; + +/// Used to store/manipualte information about a runtime call that involves +/// host to device memory offloading. 
+struct HideMemTransferLatency { + + struct MemoryTransfer { + struct OffloadArray { + SmallVector LastAccesses; + SmallVector StoredAddresses; + }; + + CallBase *RuntimeCall; + MemorySSA &MSSA; + std::unique_ptr BasePtrs; + std::unique_ptr Ptrs; + std::unique_ptr Sizes; + + MemoryTransfer(CallBase *RuntimeCall, MemorySSA &MSSA) : + RuntimeCall{RuntimeCall}, MSSA{MSSA}, + BasePtrs {std::make_unique()}, + Ptrs {std::make_unique()}, + Sizes {std::make_unique()} + {} + + static bool isFilled(OffloadArray &OA) { + for (auto *Acc : OA.LastAccesses) + if (!Acc) + return false; + + for (auto *Addr : OA.StoredAddresses) + if (!Addr) + return false; + return true; + } + }; + + HideMemTransferLatency(CallBase *RuntimeCall, + MemorySSA &MSSA, + const DataLayout &DL) + : MT(RuntimeCall, MSSA), DL(DL) {} + + bool splitMemoryTransfer(); + bool getValuesInOfflArrays(); + bool getValuesInOfflArray(Value *OfflArray, + MemoryTransfer::OffloadArray &Dst, + User *Before = nullptr); + + /// Returns the integer representation of \p V. + static uint64_t getIntLiteral(const Value *V) { + assert(V && "Getting Integer value of nullptr"); + return (dyn_cast(V))->getZExtValue(); + } + +private: + MemoryTransfer MT; + const DataLayout &DL; +}; + /// Helper to remember if the module contains OpenMP (runtime calls), to be used /// foremost with containsOpenMP. struct OpenMPInModule { Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -9,12 +9,12 @@ // OpenMP specific optimizations: // // - Deduplication of runtime calls, e.g., omp_get_thread_num. +// - Latency hiding of host to device memory transfers. 
// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/OpenMPOpt.h" -#include "llvm/ADT/EnumeratedArray.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" @@ -26,6 +26,8 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/ValueTracking.h" using namespace llvm; using namespace omp; @@ -54,156 +56,11 @@ static constexpr auto TAG = "[" DEBUG_TYPE "]"; #endif -namespace { - -/// OpenMP specific information. For now, stores RFIs and ICVs also needed for -/// Attributor runs. -struct OMPInformationCache : public InformationCache { - OMPInformationCache(Module &M, AnalysisGetter &AG, - BumpPtrAllocator &Allocator, SetVector *CGSCC, - SmallPtrSetImpl &ModuleSlice) - : InformationCache(M, AG, Allocator, CGSCC), ModuleSlice(ModuleSlice), - OMPBuilder(M) { - initializeTypes(M); - initializeRuntimeFunctions(); - initializeInternalControlVars(); - - OMPBuilder.initialize(); - } - - /// Generic information that describes an internal control variable. - struct InternalControlVarInfo { - /// The kind, as described by InternalControlVar enum. - InternalControlVar Kind; - - /// The name of the ICV. - StringRef Name; - - /// Environment variable associated with this ICV. - StringRef EnvVarName; - - /// Initial value kind. - ICVInitValue InitKind; - - /// Initial value. - ConstantInt *InitValue; - - /// Setter RTL function associated with this ICV. - RuntimeFunction Setter; - - /// Getter RTL function associated with this ICV. - RuntimeFunction Getter; - - /// RTL Function corresponding to the override clause of this ICV - RuntimeFunction Clause; - }; - - /// Generic information that describes a runtime function - struct RuntimeFunctionInfo { - - /// The kind, as described by the RuntimeFunction enum. 
- RuntimeFunction Kind; - - /// The name of the function. - StringRef Name; - - /// Flag to indicate a variadic function. - bool IsVarArg; - - /// The return type of the function. - Type *ReturnType; - - /// The argument types of the function. - SmallVector ArgumentTypes; - - /// The declaration if available. - Function *Declaration = nullptr; - - /// Uses of this runtime function per function containing the use. - using UseVector = SmallVector; - - /// Return the vector of uses in function \p F. - UseVector &getOrCreateUseVector(Function *F) { - std::unique_ptr &UV = UsesMap[F]; - if (!UV) - UV = std::make_unique(); - return *UV; - } - - /// Return the vector of uses in function \p F or `nullptr` if there are - /// none. - const UseVector *getUseVector(Function &F) const { - auto I = UsesMap.find(&F); - if (I != UsesMap.end()) - return I->second.get(); - return nullptr; - } - - /// Return how many functions contain uses of this runtime function. - size_t getNumFunctionsWithUses() const { return UsesMap.size(); } - - /// Return the number of arguments (or the minimal number for variadic - /// functions). - size_t getNumArgs() const { return ArgumentTypes.size(); } - - /// Run the callback \p CB on each use and forget the use if the result is - /// true. The callback will be fed the function in which the use was - /// encountered as second argument. - void foreachUse(function_ref CB) { - for (auto &It : UsesMap) - foreachUse(CB, It.first, It.second.get()); - } - - /// Run the callback \p CB on each use within the function \p F and forget - /// the use if the result is true. - void foreachUse(function_ref CB, Function *F, - UseVector *Uses = nullptr) { - SmallVector ToBeDeleted; - ToBeDeleted.clear(); - - unsigned Idx = 0; - UseVector &UV = Uses ? 
*Uses : getOrCreateUseVector(F); - - for (Use *U : UV) { - if (CB(*U, *F)) - ToBeDeleted.push_back(Idx); - ++Idx; - } - - // Remove the to-be-deleted indices in reverse order as prior - // modifcations will not modify the smaller indices. - while (!ToBeDeleted.empty()) { - unsigned Idx = ToBeDeleted.pop_back_val(); - UV[Idx] = UV.back(); - UV.pop_back(); - } - } - - private: - /// Map from functions to all uses of this runtime function contained in - /// them. - DenseMap> UsesMap; - }; - - /// The slice of the module we are allowed to look at. - SmallPtrSetImpl &ModuleSlice; - - /// An OpenMP-IR-Builder instance - OpenMPIRBuilder OMPBuilder; - - /// Map from runtime function kind to the runtime function description. - EnumeratedArray - RFIs; - - /// Map from ICV kind to the ICV description. - EnumeratedArray - ICVs; +//===----------------------------------------------------------------------===// +// Definitions of the OMPInformationCache helper structure. +//===----------------------------------------------------------------------===// - /// Helper to initialize all internal control variable information for those - /// defined in OMPKinds.def. - void initializeInternalControlVars() { +void OMPInformationCache::initializeInternalControlVars() { #define ICV_RT_SET(_Name, RTL) \ { \ auto &ICV = ICVs[_Name]; \ @@ -237,63 +94,58 @@ } \ } #include "llvm/Frontend/OpenMP/OMPKinds.def" - } +} - /// Returns true if the function declaration \p F matches the runtime - /// function types, that is, return type \p RTFRetType, and argument types - /// \p RTFArgTypes. - static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, - SmallVector &RTFArgTypes) { - // TODO: We should output information to the user (under debug output - // and via remarks). +bool OMPInformationCache::declMatchesRTFTypes( + Function *F, Type *RTFRetType, SmallVector &RTFArgTypes) { + // TODO: We should output information to the user (under debug output + // and via remarks). 
- if (!F) - return false; - if (F->getReturnType() != RTFRetType) - return false; - if (F->arg_size() != RTFArgTypes.size()) + if (!F) + return false; + if (F->getReturnType() != RTFRetType) + return false; + if (F->arg_size() != RTFArgTypes.size()) + return false; + + auto RTFTyIt = RTFArgTypes.begin(); + for (Argument &Arg : F->args()) { + if (Arg.getType() != *RTFTyIt) return false; - auto RTFTyIt = RTFArgTypes.begin(); - for (Argument &Arg : F->args()) { - if (Arg.getType() != *RTFTyIt) - return false; + ++RTFTyIt; + } - ++RTFTyIt; - } + return true; +} - return true; - } +void OMPInformationCache::initializeRuntimeFunctions() { + // Helper to collect all uses of the decleration in the UsesMap. + auto CollectUses = [&](RuntimeFunctionInfo &RFI) { + unsigned NumUses = 0; + if (!RFI.Declaration) + return NumUses; + OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); - /// Helper to initialize all runtime function information for those defined - /// in OpenMPKinds.def. - void initializeRuntimeFunctions() { - // Helper to collect all uses of the decleration in the UsesMap. - auto CollectUses = [&](RuntimeFunctionInfo &RFI) { - unsigned NumUses = 0; - if (!RFI.Declaration) - return NumUses; - OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration); - - NumOpenMPRuntimeFunctionsIdentified += 1; - NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); - - // TODO: We directly convert uses into proper calls and unknown uses. - for (Use &U : RFI.Declaration->uses()) { - if (Instruction *UserI = dyn_cast(U.getUser())) { - if (ModuleSlice.count(UserI->getFunction())) { - RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); - ++NumUses; - } - } else { - RFI.getOrCreateUseVector(nullptr).push_back(&U); + NumOpenMPRuntimeFunctionsIdentified += 1; + NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses(); + + // TODO: We directly convert uses into proper calls and unknown uses. 
+ for (Use &U : RFI.Declaration->uses()) { + if (Instruction *UserI = dyn_cast(U.getUser())) { + if (ModuleSlice.count(UserI->getFunction())) { + RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U); ++NumUses; } + } else { + RFI.getOrCreateUseVector(nullptr).push_back(&U); + ++NumUses; } - return NumUses; - }; + } + return NumUses; + }; - Module &M = *((*ModuleSlice.begin())->getParent()); + Module &M = *((*ModuleSlice.begin())->getParent()); #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \ { \ @@ -321,413 +173,521 @@ } #include "llvm/Frontend/OpenMP/OMPKinds.def" - // TODO: We should attach the attributes defined in OMPKinds.def. - } -}; + // TODO: We should attach the attributes defined in OMPKinds.def. +} -struct OpenMPOpt { - - using OptimizationRemarkGetter = - function_ref; - - OpenMPOpt(SmallVectorImpl &SCC, CallGraphUpdater &CGUpdater, - OptimizationRemarkGetter OREGetter, - OMPInformationCache &OMPInfoCache) - : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater), - OREGetter(OREGetter), OMPInfoCache(OMPInfoCache) {} - - /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice. - bool run() { - bool Changed = false; - - LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() - << " functions in a slice with " - << OMPInfoCache.ModuleSlice.size() << " functions\n"); - - /// Print initial ICV values for testing. - /// FIXME: This should be done from the Attributor once it is added. - if (PrintICVValues) { - InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel}; - - for (Function *F : OMPInfoCache.ModuleSlice) { - for (auto ICV : ICVs) { - auto ICVInfo = OMPInfoCache.ICVs[ICV]; - auto Remark = [&](OptimizationRemark OR) { - return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) - << " Value: " - << (ICVInfo.InitValue - ? 
ICVInfo.InitValue->getValue().toString(10, true) - : "IMPLEMENTATION_DEFINED"); - }; - - emitRemarkOnFunction(F, "OpenMPICVTracker", Remark); - } +//===----------------------------------------------------------------------===// +// End of definitions of the OMPInformationCache helper structure. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Definitions of the OpenMPOpt structure. +//===----------------------------------------------------------------------===// + +bool OpenMPOpt::run() { + bool Changed = false; + + LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size() + << " functions in a slice with " + << OMPInfoCache.ModuleSlice.size() << " functions\n"); + + /// Print initial ICV values for testing. + /// FIXME: This should be done from the Attributor once it is added. + if (PrintICVValues) { + InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel}; + + for (Function *F : OMPInfoCache.ModuleSlice) { + for (auto ICV : ICVs) { + auto ICVInfo = OMPInfoCache.ICVs[ICV]; + auto Remark = [&](OptimizationRemark OR) { + return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name) + << " Value: " + << (ICVInfo.InitValue + ? ICVInfo.InitValue->getValue().toString(10, true) + : "IMPLEMENTATION_DEFINED"); + }; + + emitRemarkOnFunction(F, "OpenMPICVTracker", Remark); } } + } - Changed |= deduplicateRuntimeCalls(); - Changed |= deleteParallelRegions(); + Changed |= deduplicateRuntimeCalls(); + Changed |= deleteParallelRegions(); + Changed |= hideMemTransfersLatency(); - return Changed; - } + return Changed; +} - /// Return the call if \p U is a callee use in a regular call. If \p RFI is - /// given it has to be the callee or a nullptr is returned. 
- static CallInst *getCallIfRegularCall( - Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { - CallInst *CI = dyn_cast(U.getUser()); - if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && - (!RFI || CI->getCalledFunction() == RFI->Declaration)) - return CI; - return nullptr; +bool OpenMPOpt::deduplicateRuntimeCalls() { + bool Changed = false; + + RuntimeFunction DeduplicableRuntimeCallIDs[] = { + OMPRTL_omp_get_num_threads, + OMPRTL_omp_in_parallel, + OMPRTL_omp_get_cancellation, + OMPRTL_omp_get_thread_limit, + OMPRTL_omp_get_supported_active_levels, + OMPRTL_omp_get_level, + OMPRTL_omp_get_ancestor_thread_num, + OMPRTL_omp_get_team_size, + OMPRTL_omp_get_active_level, + OMPRTL_omp_in_final, + OMPRTL_omp_get_proc_bind, + OMPRTL_omp_get_num_places, + OMPRTL_omp_get_num_procs, + OMPRTL_omp_get_place_num, + OMPRTL_omp_get_partition_num_places, + OMPRTL_omp_get_partition_place_nums}; + + // Global-tid is handled separately. + SmallSetVector GTIdArgs; + collectGlobalThreadIdArguments(GTIdArgs); + LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() + << " global thread ID arguments\n"); + + for (Function *F : SCC) { + for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) + deduplicateRuntimeCalls(*F, + OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); + + // __kmpc_global_thread_num is special as we can replace it with an + // argument in enough cases to make it worth trying. + Value *GTIdArg = nullptr; + for (Argument &Arg : F->args()) + if (GTIdArgs.count(&Arg)) { + GTIdArg = &Arg; + break; + } + Changed |= deduplicateRuntimeCalls( + *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); } - /// Return the call if \p V is a regular call. If \p RFI is given it has to be - /// the callee or a nullptr is returned. 
- static CallInst *getCallIfRegularCall( - Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) { - CallInst *CI = dyn_cast(&V); - if (CI && !CI->hasOperandBundles() && - (!RFI || CI->getCalledFunction() == RFI->Declaration)) - return CI; - return nullptr; - } + return Changed; +} -private: - /// Try to delete parallel regions if possible. - bool deleteParallelRegions() { - const unsigned CallbackCalleeOperand = 2; +bool OpenMPOpt::deleteParallelRegions() { + const unsigned CallbackCalleeOperand = 2; - OMPInformationCache::RuntimeFunctionInfo &RFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; + OMPInformationCache::RuntimeFunctionInfo &RFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call]; - if (!RFI.Declaration) + if (!RFI.Declaration) + return false; + + bool Changed = false; + auto DeleteCallCB = [&](Use &U, Function &) { + CallInst *CI = getCallIfRegularCall(U); + if (!CI) + return false; + auto *Fn = dyn_cast( + CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); + if (!Fn) + return false; + if (!Fn->onlyReadsMemory()) + return false; + if (!Fn->hasFnAttribute(Attribute::WillReturn)) return false; - bool Changed = false; - auto DeleteCallCB = [&](Use &U, Function &) { - CallInst *CI = getCallIfRegularCall(U); - if (!CI) - return false; - auto *Fn = dyn_cast( - CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts()); - if (!Fn) - return false; - if (!Fn->onlyReadsMemory()) - return false; - if (!Fn->hasFnAttribute(Attribute::WillReturn)) - return false; + LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " + << CI->getCaller()->getName() << "\n"); - LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in " - << CI->getCaller()->getName() << "\n"); - - auto Remark = [&](OptimizationRemark OR) { - return OR << "Parallel region in " - << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName()) - << " deleted"; - }; - emitRemark(CI, "OpenMPParallelRegionDeletion", - Remark); - - 
CGUpdater.removeCallSite(*CI); - CI->eraseFromParent(); - Changed = true; - ++NumOpenMPParallelRegionsDeleted; - return true; + auto Remark = [&](OptimizationRemark OR) { + return OR << "Parallel region in " + << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName()) + << " deleted"; }; + emitRemark(CI, "OpenMPParallelRegionDeletion", + Remark); - RFI.foreachUse(DeleteCallCB); + CGUpdater.removeCallSite(*CI); + CI->eraseFromParent(); + Changed = true; + ++NumOpenMPParallelRegionsDeleted; + return true; + }; - return Changed; - } + RFI.foreachUse(DeleteCallCB); - /// Try to eliminiate runtime calls by reusing existing ones. - bool deduplicateRuntimeCalls() { - bool Changed = false; - - RuntimeFunction DeduplicableRuntimeCallIDs[] = { - OMPRTL_omp_get_num_threads, - OMPRTL_omp_in_parallel, - OMPRTL_omp_get_cancellation, - OMPRTL_omp_get_thread_limit, - OMPRTL_omp_get_supported_active_levels, - OMPRTL_omp_get_level, - OMPRTL_omp_get_ancestor_thread_num, - OMPRTL_omp_get_team_size, - OMPRTL_omp_get_active_level, - OMPRTL_omp_in_final, - OMPRTL_omp_get_proc_bind, - OMPRTL_omp_get_num_places, - OMPRTL_omp_get_num_procs, - OMPRTL_omp_get_place_num, - OMPRTL_omp_get_partition_num_places, - OMPRTL_omp_get_partition_place_nums}; - - // Global-tid is handled separately. - SmallSetVector GTIdArgs; - collectGlobalThreadIdArguments(GTIdArgs); - LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size() - << " global thread ID arguments\n"); - - for (Function *F : SCC) { - for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs) - deduplicateRuntimeCalls(*F, - OMPInfoCache.RFIs[DeduplicableRuntimeCallID]); - - // __kmpc_global_thread_num is special as we can replace it with an - // argument in enough cases to make it worth trying. 
- Value *GTIdArg = nullptr; - for (Argument &Arg : F->args()) - if (GTIdArgs.count(&Arg)) { - GTIdArg = &Arg; - break; - } - Changed |= deduplicateRuntimeCalls( - *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg); - } + return Changed; +} + +bool OpenMPOpt::hideMemTransfersLatency() { + OMPInformationCache::RuntimeFunctionInfo &RFI = + OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin]; + bool Changed = false; + auto SplitDataTransfer = [&] (Use &U, Function &Decl) { + auto *RTCall = getCallIfRegularCall(U, &RFI); + if (!RTCall) + return false; + + auto *MSSAResult = + OMPInfoCache.getAnalysisResultForFunction( + *RTCall->getCaller()); + if (!MSSAResult) + return false; + + auto &MSSA = MSSAResult->getMSSA(); + HideMemTransferLatency Opt(RTCall, MSSA, OMPInfoCache.getDL()); + Changed = Opt.splitMemoryTransfer(); return Changed; - } + }; - static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, - bool GlobalOnly, bool &SingleChoice) { - if (CurrentIdent == NextIdent) - return CurrentIdent; + RFI.foreachUse(SplitDataTransfer); + return Changed; +} - // TODO: Figure out how to actually combine multiple debug locations. For - // now we just keep an existing one if there is a single choice. - if (!GlobalOnly || isa(NextIdent)) { - SingleChoice = !CurrentIdent; - return NextIdent; - } - return nullptr; +Value *OpenMPOpt::combinedIdentStruct(Value *CurrentIdent, Value *NextIdent, + bool GlobalOnly, bool &SingleChoice) { + if (CurrentIdent == NextIdent) + return CurrentIdent; + + // TODO: Figure out how to actually combine multiple debug locations. For + // now we just keep an existing one if there is a single choice. + if (!GlobalOnly || isa(NextIdent)) { + SingleChoice = !CurrentIdent; + return NextIdent; } + return nullptr; +} - /// Return an `struct ident_t*` value that represents the ones used in the - /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not - /// return a local `struct ident_t*`. 
For now, if we cannot find a suitable - /// return value we create one from scratch. We also do not yet combine - /// information, e.g., the source locations, see combinedIdentStruct. - Value * - getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI, - Function &F, bool GlobalOnly) { - bool SingleChoice = true; - Value *Ident = nullptr; - auto CombineIdentStruct = [&](Use &U, Function &Caller) { - CallInst *CI = getCallIfRegularCall(U, &RFI); - if (!CI || &F != &Caller) - return false; - Ident = combinedIdentStruct(Ident, CI->getArgOperand(0), - /* GlobalOnly */ true, SingleChoice); +Value * +OpenMPOpt::getCombinedIdentFromCallUsesIn( + OMPInformationCache::RuntimeFunctionInfo &RFI, + Function &F, bool GlobalOnly) { + bool SingleChoice = true; + Value *Ident = nullptr; + auto CombineIdentStruct = [&](Use &U, Function &Caller) { + CallInst *CI = getCallIfRegularCall(U, &RFI); + if (!CI || &F != &Caller) return false; - }; - RFI.foreachUse(CombineIdentStruct); - - if (!Ident || !SingleChoice) { - // The IRBuilder uses the insertion block to get to the module, this is - // unfortunate but we work around it for now. - if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock()) - OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy( - &F.getEntryBlock(), F.getEntryBlock().begin())); - // Create a fallback location if non was found. - // TODO: Use the debug locations of the calls instead. - Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); - Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); - } - return Ident; + Ident = combinedIdentStruct(Ident, CI->getArgOperand(0), + /* GlobalOnly */ true, SingleChoice); + return false; + }; + RFI.foreachUse(CombineIdentStruct); + + if (!Ident || !SingleChoice) { + // The IRBuilder uses the insertion block to get to the module, this is + // unfortunate but we work around it for now. 
+ if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock()) + OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy( + &F.getEntryBlock(), F.getEntryBlock().begin())); + // Create a fallback location if non was found. + // TODO: Use the debug locations of the calls instead. + Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(); + Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc); } + return Ident; +} - /// Try to eliminiate calls of \p RFI in \p F by reusing an existing one or - /// \p ReplVal if given. - bool deduplicateRuntimeCalls(Function &F, - OMPInformationCache::RuntimeFunctionInfo &RFI, - Value *ReplVal = nullptr) { - auto *UV = RFI.getUseVector(F); - if (!UV || UV->size() + (ReplVal != nullptr) < 2) - return false; +bool OpenMPOpt::deduplicateRuntimeCalls( + Function &F, OMPInformationCache::RuntimeFunctionInfo &RFI, Value *ReplVal) { + auto *UV = RFI.getUseVector(F); + if (!UV || UV->size() + (ReplVal != nullptr) < 2) + return false; - LLVM_DEBUG( - dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name - << (ReplVal ? " with an existing value\n" : "\n") << "\n"); + LLVM_DEBUG( + dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name + << (ReplVal ? " with an existing value\n" : "\n") << "\n"); - assert((!ReplVal || (isa(ReplVal) && - cast(ReplVal)->getParent() == &F)) && - "Unexpected replacement value!"); + assert((!ReplVal || (isa(ReplVal) && + cast(ReplVal)->getParent() == &F)) && + "Unexpected replacement value!"); - // TODO: Use dominance to find a good position instead. - auto CanBeMoved = [](CallBase &CB) { - unsigned NumArgs = CB.getNumArgOperands(); - if (NumArgs == 0) - return true; - if (CB.getArgOperand(0)->getType() != IdentPtr) - return false; - for (unsigned u = 1; u < NumArgs; ++u) - if (isa(CB.getArgOperand(u))) - return false; + // TODO: Use dominance to find a good position instead. 
+ auto CanBeMoved = [](CallBase &CB) { + unsigned NumArgs = CB.getNumArgOperands(); + if (NumArgs == 0) return true; - }; - - if (!ReplVal) { - for (Use *U : *UV) - if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) { - if (!CanBeMoved(*CI)) - continue; - - auto Remark = [&](OptimizationRemark OR) { - auto newLoc = &*F.getEntryBlock().getFirstInsertionPt(); - return OR << "OpenMP runtime call " - << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to " - << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc()); - }; - emitRemark(CI, "OpenMPRuntimeCodeMotion", Remark); - - CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); - ReplVal = CI; - break; - } - if (!ReplVal) + if (CB.getArgOperand(0)->getType() != IdentPtr) + return false; + for (unsigned u = 1; u < NumArgs; ++u) + if (isa(CB.getArgOperand(u))) return false; - } + return true; + }; - // If we use a call as a replacement value we need to make sure the ident is - // valid at the new location. For now we just pick a global one, either - // existing and used by one of the calls, or created from scratch. 
- if (CallBase *CI = dyn_cast(ReplVal)) { - if (CI->getNumArgOperands() > 0 && - CI->getArgOperand(0)->getType() == IdentPtr) { - Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, - /* GlobalOnly */ true); - CI->setArgOperand(0, Ident); + if (!ReplVal) { + for (Use *U : *UV) + if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) { + if (!CanBeMoved(*CI)) + continue; + + auto Remark = [&](OptimizationRemark OR) { + auto newLoc = &*F.getEntryBlock().getFirstInsertionPt(); + return OR << "OpenMP runtime call " + << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to " + << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc()); + }; + emitRemark(CI, "OpenMPRuntimeCodeMotion", Remark); + + CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt()); + ReplVal = CI; + break; } + if (!ReplVal) + return false; + } + + // If we use a call as a replacement value we need to make sure the ident is + // valid at the new location. For now we just pick a global one, either + // existing and used by one of the calls, or created from scratch. 
+ if (CallBase *CI = dyn_cast(ReplVal)) { + if (CI->getNumArgOperands() > 0 && + CI->getArgOperand(0)->getType() == IdentPtr) { + Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F, + /* GlobalOnly */ true); + CI->setArgOperand(0, Ident); } + } - bool Changed = false; - auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) { - CallInst *CI = getCallIfRegularCall(U, &RFI); - if (!CI || CI == ReplVal || &F != &Caller) - return false; - assert(CI->getCaller() == &F && "Unexpected call!"); - - auto Remark = [&](OptimizationRemark OR) { - return OR << "OpenMP runtime call " - << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated"; - }; - emitRemark(CI, "OpenMPRuntimeDeduplicated", Remark); - - CGUpdater.removeCallSite(*CI); - CI->replaceAllUsesWith(ReplVal); - CI->eraseFromParent(); - ++NumOpenMPRuntimeCallsDeduplicated; - Changed = true; - return true; + bool Changed = false; + auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) { + CallInst *CI = getCallIfRegularCall(U, &RFI); + if (!CI || CI == ReplVal || &F != &Caller) + return false; + assert(CI->getCaller() == &F && "Unexpected call!"); + + auto Remark = [&](OptimizationRemark OR) { + return OR << "OpenMP runtime call " + << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated"; }; - RFI.foreachUse(ReplaceAndDeleteCB); + emitRemark(CI, "OpenMPRuntimeDeduplicated", Remark); - return Changed; - } + CGUpdater.removeCallSite(*CI); + CI->replaceAllUsesWith(ReplVal); + CI->eraseFromParent(); + ++NumOpenMPRuntimeCallsDeduplicated; + Changed = true; + return true; + }; + RFI.foreachUse(ReplaceAndDeleteCB); - /// Collect arguments that represent the global thread id in \p GTIdArgs. - void collectGlobalThreadIdArguments(SmallSetVector >IdArgs) { - // TODO: Below we basically perform a fixpoint iteration with a pessimistic - // initialization. We could define an AbstractAttribute instead and - // run the Attributor here once it can be run as an SCC pass. 
+ return Changed; +} - // Helper to check the argument \p ArgNo at all call sites of \p F for - // a GTId. - auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) { - if (!F.hasLocalLinkage()) - return false; - for (Use &U : F.uses()) { - if (CallInst *CI = getCallIfRegularCall(U)) { - Value *ArgOp = CI->getArgOperand(ArgNo); - if (CI == &RefCI || GTIdArgs.count(ArgOp) || - getCallIfRegularCall( - *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num])) - continue; - } - return false; - } - return true; - }; +template +void OpenMPOpt::emitRemark(Instruction *Inst, StringRef RemarkName, + RemarkCallBack &&RemarkCB) { + Function *F = Inst->getParent()->getParent(); + auto &ORE = OREGetter(F); - // Helper to identify uses of a GTId as GTId arguments. - auto AddUserArgs = [&](Value >Id) { - for (Use &U : GTId.uses()) - if (CallInst *CI = dyn_cast(U.getUser())) - if (CI->isArgOperand(&U)) - if (Function *Callee = CI->getCalledFunction()) - if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI)) - GTIdArgs.insert(Callee->getArg(U.getOperandNo())); - }; + ORE.emit( + [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); }); +} + +void OpenMPOpt::emitRemarkOnFunction( + Function *F, StringRef RemarkName, + function_ref &&RemarkCB) { + auto &ORE = OREGetter(F); - // The argument users of __kmpc_global_thread_num calls are GTIds. - OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI = - OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]; + ORE.emit([&]() { + return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F)); + }); +} + +void OpenMPOpt::collectGlobalThreadIdArguments( + SmallSetVector >IdArgs) { + // TODO: Below we basically perform a fixpoint iteration with a pessimistic + // initialization. We could define an AbstractAttribute instead and + // run the Attributor here once it can be run as an SCC pass. 
- GlobThreadNumRFI.foreachUse([&](Use &U, Function &F) { - if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI)) - AddUserArgs(*CI); + // Helper to check the argument \p ArgNo at all call sites of \p F for + // a GTId. + auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) { + if (!F.hasLocalLinkage()) + return false; + for (Use &U : F.uses()) { + if (CallInst *CI = getCallIfRegularCall(U)) { + Value *ArgOp = CI->getArgOperand(ArgNo); + if (CI == &RefCI || GTIdArgs.count(ArgOp) || + getCallIfRegularCall( + *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num])) + continue; + } return false; - }); + } + return true; + }; + + // Helper to identify uses of a GTId as GTId arguments. + auto AddUserArgs = [&](Value >Id) { + for (Use &U : GTId.uses()) + if (CallInst *CI = dyn_cast(U.getUser())) + if (CI->isArgOperand(&U)) + if (Function *Callee = CI->getCalledFunction()) + if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI)) + GTIdArgs.insert(Callee->getArg(U.getOperandNo())); + }; + + // The argument users of __kmpc_global_thread_num calls are GTIds. + OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI = + OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]; + + GlobThreadNumRFI.foreachUse([&](Use &U, Function &F) { + if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI)) + AddUserArgs(*CI); + return false; + }); + + // Transitively search for more arguments by looking at the users of the + // ones we know already. During the search the GTIdArgs vector is extended + // so we cannot cache the size nor can we use a range based for. 
+ for (unsigned u = 0; u < GTIdArgs.size(); ++u) + AddUserArgs(*GTIdArgs[u]); +} + +CallInst *OpenMPOpt::getCallIfRegularCall( + Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI) { + CallInst *CI = dyn_cast(U.getUser()); + if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() && + (!RFI || CI->getCalledFunction() == RFI->Declaration)) + return CI; + return nullptr; +} + +CallInst *OpenMPOpt::getCallIfRegularCall( + Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI) { + CallInst *CI = dyn_cast(&V); + if (CI && !CI->hasOperandBundles() && + (!RFI || CI->getCalledFunction() == RFI->Declaration)) + return CI; + return nullptr; +} + +//===----------------------------------------------------------------------===// +// End of definitions of the OpenMPOpt structure. +//===----------------------------------------------------------------------===// - // Transitively search for more arguments by looking at the users of the - // ones we know already. During the search the GTIdArgs vector is extended - // so we cannot cache the size nor can we use a range based for. - for (unsigned u = 0; u < GTIdArgs.size(); ++u) - AddUserArgs(*GTIdArgs[u]); +//===----------------------------------------------------------------------===// +// Definitions of the HideMemTransferLatency optimization. +//===----------------------------------------------------------------------===// + +bool HideMemTransferLatency::splitMemoryTransfer() { + bool Changed = false; + bool Success = getValuesInOfflArrays(); + if (!Success) { + LLVM_DEBUG(dbgs() << TAG << "Couldn't get offload arrays in call to " + << MT.RuntimeCall->getName() << " in function " + << MT.RuntimeCall->getCaller()->getName() << "\n"); + return false; } - /// Emit a remark generically - /// - /// This template function can be used to generically emit a remark. 
The - /// RemarkKind should be one of the following: - /// - OptimizationRemark to indicate a successful optimization attempt - /// - OptimizationRemarkMissed to report a failed optimization attempt - /// - OptimizationRemarkAnalysis to provide additional information about an - /// optimization attempt - /// - /// The remark is built using a callback function provided by the caller that - /// takes a RemarkKind as input and returns a RemarkKind. - template > - void emitRemark(Instruction *Inst, StringRef RemarkName, - RemarkCallBack &&RemarkCB) { - Function *F = Inst->getParent()->getParent(); - auto &ORE = OREGetter(F); - - ORE.emit( - [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); }); + return Changed; +} + +bool HideMemTransferLatency::getValuesInOfflArrays() { + auto *RuntimeCall = MT.RuntimeCall; + auto *BasePtrsArg = RuntimeCall->arg_begin() + 2; // **offload_baseptrs. + auto *PtrsArg = RuntimeCall->arg_begin() + 3; // **offload_ptrs. + auto *SizesArg = RuntimeCall->arg_begin() + 4; // **offload_sizes. + + // Get values stored in **offload_baseptrs. + auto *V = GetUnderlyingObject(BasePtrsArg->get(), DL); + bool Success = getValuesInOfflArray(V, *MT.BasePtrs, RuntimeCall); + if (!Success) { + LLVM_DEBUG(dbgs() << TAG << "Couldn't get offload_baseptrs in call to " + << RuntimeCall->getName() << " in function " + << RuntimeCall->getCaller()->getName() << "\n"); + return false; } - /// Emit a remark on a function. Since only OptimizationRemark is supporting - /// this, it can't be made generic. - void emitRemarkOnFunction( - Function *F, StringRef RemarkName, - function_ref &&RemarkCB) { - auto &ORE = OREGetter(F); + // Get values stored in **offload_ptrs. 
+ V = GetUnderlyingObject(PtrsArg->get(), DL); + Success = getValuesInOfflArray(V, *MT.Ptrs, RuntimeCall); + if (!Success) { + LLVM_DEBUG(dbgs() << TAG << "Couldn't get offload_ptrs in call to " + << RuntimeCall->getName() << " in function " + << RuntimeCall->getCaller()->getName() << "\n"); + return false; + } - ORE.emit([&]() { - return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F)); - }); + // Get values stored in **offload_sizes. + V = GetUnderlyingObject(SizesArg->get(), DL); + Success = getValuesInOfflArray(V, *MT.Sizes, RuntimeCall); + if (!Success) { + LLVM_DEBUG(dbgs() << TAG << "Couldn't get offload_sizes in call to " + << RuntimeCall->getName() << " in function " + << RuntimeCall->getCaller()->getName() << "\n"); + return false; } - /// The underyling module. - Module &M; + return true; +} - /// The SCC we are operating on. - SmallVectorImpl &SCC; +/// Gets the values stored in \p OfflArray and stores them in \p Dst. +/// \p Before serves as a lower bound, so don't look at accesses after that. +bool HideMemTransferLatency::getValuesInOfflArray( + Value *OfflArray, MemoryTransfer::OffloadArray &Dst, User *Before) { + assert(OfflArray && "Can't get values in nullptr!"); - /// Callback to update the call graph, the first argument is a removed call, - /// the second an optional replacement call. - CallGraphUpdater &CGUpdater; + if (!isa(OfflArray)) { + LLVM_DEBUG(dbgs() << TAG << "Only alloca arrays supported.\n"); + return false; + } - /// Callback to get an OptimizationRemarkEmitter from a Function * - OptimizationRemarkGetter OREGetter; + auto *ArrayAlloc = cast(OfflArray); + const uint64_t NumValues = + ArrayAlloc->getAllocatedType()->getArrayNumElements(); - /// OpenMP-specific information cache. Also Used for Attributor runs. 
- OMPInformationCache &OMPInfoCache; -}; -} // namespace + auto &LastAccesses = Dst.LastAccesses; + auto &StoredAddresses = Dst.StoredAddresses; + LastAccesses.assign(NumValues, nullptr); + StoredAddresses.assign(NumValues, nullptr); + + // Get last accesses to the array right before Before. + for (auto *Usr : OfflArray->users()) { + // If reached lower limit. + if (Before && Usr == Before) + break; + + auto *Access = dyn_cast(Usr); + if (!Access) + continue; + + auto *ArrayIdx = Access->idx_begin() + 1; + if (ArrayIdx == Access->idx_end()) + continue; + + const uint64_t IdxLiteral = getIntLiteral(ArrayIdx->get()); + LastAccesses[IdxLiteral] = Usr; + } + + // Get stored addresses. + for (unsigned It = 0; It < NumValues; ++It) { + auto *Accs = LastAccesses[It]; + auto AccsUsr = Accs->user_begin(); + if (AccsUsr == Accs->user_end()) { + LLVM_DEBUG(dbgs() << TAG << "Useless access to offload array.\n"); + return false; + } + + auto *I = cast(*AccsUsr); + if (I->isCast()) + AccsUsr = I->user_begin(); + + if (!isa(*AccsUsr)) { + LLVM_DEBUG(dbgs() << TAG << "Unrecognized access pattern.\n"); + return false; + } + + StoredAddresses[It] = + GetUnderlyingObject(AccsUsr->getOperand(0), DL); + } + + if (!MemoryTransfer::isFilled(Dst)) { + LLVM_DEBUG(dbgs() << TAG << "Didn't get all values in offload array.\n"); + return false; + } + + return true; +} + +//===----------------------------------------------------------------------===// +// End of definitions of the HideMemTransferLatency optimization. +//===----------------------------------------------------------------------===// PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, @@ -763,7 +723,7 @@ SetVector Functions(SCC.begin(), SCC.end()); BumpPtrAllocator Allocator; OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator, - /*CGSCC*/ &Functions, ModuleSlice); + /*CGSCC*/ &Functions, ModuleSlice); // TODO: Compute the module slice we are allowed to look at. 
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache); Index: llvm/unittests/Transforms/IPO/CMakeLists.txt =================================================================== --- llvm/unittests/Transforms/IPO/CMakeLists.txt +++ llvm/unittests/Transforms/IPO/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(OpenMPOpt) + set(LLVM_LINK_COMPONENTS Core Support Index: llvm/unittests/Transforms/IPO/OpenMPOpt/CMakeLists.txt =================================================================== --- /dev/null +++ llvm/unittests/Transforms/IPO/OpenMPOpt/CMakeLists.txt @@ -0,0 +1,11 @@ +set(LLVM_LINK_COMPONENTS + ipo + Core + Analysis + AsmParser + FrontendOpenMP + ) + +add_llvm_unittest(OpenMPOptUnitTests + HideMemTransferLatency.cpp + ) \ No newline at end of file Index: llvm/unittests/Transforms/IPO/OpenMPOpt/HideMemTransferLatency.cpp =================================================================== --- /dev/null +++ llvm/unittests/Transforms/IPO/OpenMPOpt/HideMemTransferLatency.cpp @@ -0,0 +1,43 @@ +//===- HideMemTransferLatency.cpp - Unit tests for hiding transfer latency ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OpenMPOptTest.h" + +namespace { + +class HideMemTransferLatencyTest : public OpenMPOptTest { +protected: + /// This stores the runtime calls for further use within each test. 
+ SmallVector MemTransferCalls;
+
+ void getCallSites(const char *ModuleString) {
+ auto M = parseModuleString(ModuleString);
+ initializeOMPInfoCache(*M);
+
+ auto &RFI = OMPInfoCache->RFIs[OMPRTL___tgt_target_data_begin];
+ auto GetCallSites = [&](Use &U, Function &Decl) {
+ auto *RTCall = OpenMPOpt::getCallIfRegularCall(U, &RFI);
+ if (!RTCall)
+ return false;
+ MemTransferCalls.push_back(RTCall);
+ return false;
+ };
+ RFI.foreachUse(GetCallSites);
+ EXPECT_FALSE(MemTransferCalls.empty());
+ }
+};
+
+TEST_F(HideMemTransferLatencyTest, GetValuesInOfflArrays) {
+ // TODO: Replace this placeholder with IR containing __tgt_target_data_begin calls.
+ const char *ModuleString = "";
+ getCallSites(ModuleString);
+
+ EXPECT_TRUE(true);
+}
+
+} // end anonymous namespace Index: llvm/unittests/Transforms/IPO/OpenMPOpt/OpenMPOptTest.h =================================================================== --- /dev/null +++ llvm/unittests/Transforms/IPO/OpenMPOpt/OpenMPOptTest.h @@ -0,0 +1,64 @@ +//===- OpenMPOptTest.h - Common harness for OpenMPOpt unit tests ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UNITTESTS_TRANSFORMS_IPO_OPENMPOPT_H +#define LLVM_UNITTESTS_TRANSFORMS_IPO_OPENMPOPT_H + +#include "llvm/Transforms/IPO/OpenMPOpt.h" +#include "llvm/AsmParser/Parser.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace omp; + +namespace { + +class OpenMPOptTest : public ::testing::Test { +protected: + std::unique_ptr OMPInfoCache; + std::unique_ptr Ctx; + + OpenMPOptTest() : Ctx(new LLVMContext) {} + + /// Initializes the OMPInfoCache member given a module \p M. + /// This must be called before using OMPInfoCache. 
+ void initializeOMPInfoCache(Module &M) { + CallGraph CG(M); + scc_iterator CGI = scc_begin(&CG); + CallGraphSCC CGSCC(CG, &CGI); + + SmallPtrSet ModuleSlice; + SmallVector SCC; + for (CallGraphNode *CGN : CGSCC) + if (Function *Fn = CGN->getFunction()) + if (!Fn->isDeclaration()) { + SCC.push_back(Fn); + ModuleSlice.insert(Fn); + } + + EXPECT_FALSE(SCC.empty()); + + AnalysisGetter AG; + SetVector Functions(SCC.begin(), SCC.end()); + BumpPtrAllocator Allocator; + OMPInfoCache = std::make_unique( + *(Functions.back()->getParent()), AG, Allocator, /*CGSCC*/ &Functions, + ModuleSlice); + } + + std::unique_ptr parseModuleString(const char *ModuleString) { + SMDiagnostic Err; + auto M = parseAssemblyString(ModuleString, Err, *Ctx); + EXPECT_TRUE(M); + return M; + } +}; + +} // end anonymous namespace + +#endif // LLVM_UNITTESTS_TRANSFORMS_IPO_OPENMPOPT_H