diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -217,6 +217,14 @@ NotPreservedAnalysisIDs.insert(ID); } + /// TODO: + template void unpreserve() { unpreserve(AnalysisT::ID()); } + + /// TODO: + void unpreserve(AnalysisKey *ID) { + PreservedIDs.erase(ID); + } + /// Intersect this set with another in place. /// /// This is a mutating operation on this preserved set, removing all @@ -261,6 +269,69 @@ PreservedIDs.erase(ID); } + /// Unite this set with another in place. Any explicitly preserved ID by any + /// set will be preserved by resulting set (even if explicitly not preserved + /// by one of the sets). Only those explicitly not preserved IDs which are not + /// explicitly preserved by any set will be in the resulting set of explicitly + /// not preserved IDs. + /// + /// This is a mutating operation on this preserved set, adding all + /// preserved in the argument if not already preserved in this set. + void unite(const PreservedAnalyses &Arg) { + if (areAllPreserved()) + return; + if (Arg.areAllPreserved()) { + *this = Arg; + return; + } + + // First remove explicitly not preserved IDs from this set if explicitly + // preserved by the Arg. + for (auto *ID : NotPreservedAnalysisIDs) { + if (Arg.PreservedIDs.contains(ID)) + NotPreservedAnalysisIDs.erase(ID); + } + // Then add to the target set all explicitly not preserved IDs by the Arg + // but only if not already explicitly preserved by the this set. + for (auto *ID : Arg.NotPreservedAnalysisIDs) { + if (!PreservedIDs.contains(ID)) + NotPreservedAnalysisIDs.insert(ID); + } + + // Finally unite explicitly preserved IDs. + for (auto *ID : Arg.PreservedIDs) + PreservedIDs.insert(ID); + } + + /// Intersect this set with a temporary other set in place. + /// + /// This is a mutating operation on this preserved set, removing all + /// preserved passes which are not also preserved in the argument. + void unite(PreservedAnalyses &&Arg) { + if (Arg.areAllPreserved()) + return; + if (areAllPreserved()) { + *this = std::move(Arg); + return; + } + // First remove explicitly not preserved IDs from this set if explicitly + // preserved by the Arg. + for (auto *ID : NotPreservedAnalysisIDs) { + if (Arg.PreservedIDs.contains(ID)) + NotPreservedAnalysisIDs.erase(ID); + } + // Then add to the target set all explicitly not preserved IDs by the Arg + // but only if not already explicitly preserved by the this set. + for (auto *ID : Arg.NotPreservedAnalysisIDs) { + if (!PreservedIDs.contains(ID)) + NotPreservedAnalysisIDs.insert(ID); + } + + // Finally unite explicitly preserved IDs. + for (auto *ID : Arg.PreservedIDs) + PreservedIDs.insert(ID); + } + /// A checker object that makes it easy to query for whether an analysis or /// some set covering it is preserved. class PreservedAnalysisChecker { @@ -329,6 +400,10 @@ PreservedIDs.count(&AllAnalysesKey); } + bool areNonePreserved() const { + return PreservedIDs.empty(); + } + /// Directly test whether a set of analyses is preserved. /// /// This is only true when no analyses have been explicitly abandoned. diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h @@ -56,25 +56,38 @@ #ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/DemandedBits.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" + + #include namespace llvm { -class AssumptionCache; -class BlockFrequencyInfo; -class DemandedBits; -class DominatorTree; +//class AssumptionCache; +//class BlockFrequencyInfo; +//class DemandedBits; +//class DominatorTree; class Function; class Loop; -class LoopAccessInfoManager; -class LoopInfo; -class OptimizationRemarkEmitter; -class ProfileSummaryInfo; -class ScalarEvolution; -class TargetLibraryInfo; -class TargetTransformInfo; +//class LoopAccessInfoManager; +//class LoopInfo; +//class OptimizationRemarkEmitter; +//class ProfileSummaryInfo; +//class ScalarEvolution; +//class TargetLibraryInfo; +//class TargetTransformInfo; extern cl::opt EnableLoopInterleaving; extern cl::opt EnableLoopVectorization; @@ -147,18 +160,322 @@ } }; -/// Storage for information about made changes. -struct LoopVectorizeResult { - bool MadeAnyChange; - bool MadeCFGChange; +/// Utility class aimed at simplification of analyses tracking when they need to +/// be requested in a lazy fashion with potential invalidation in between. +/// It automatically updates/rebuilds tracked analyses in cases when they have +/// been direct or indirectly invalidated. +/// Let us assume some transformation pass 'T' depends on analyses 'A', 'B' and +/// 'C'. In addition, 'T' needs analyses 'C' at some cases only while analysis +/// 'B' may be invalidated by the time 'C' is needed. Then instead of explicit +/// declaration of 'A', 'B' and 'C' (and manual tracking) simply declare +/// class T { +/// LocalAnalysesProvider LAP; +/// } +/// +/// Now, at the moment you need 'A' and 'B' enable them the following way: +/// +/// LAP->enable(true /* is_required_analysis */); +/// LAP->enable(true /* is_required_analysis */); +/// +/// From that moment (and until explicit invalidation through corresponding APIs) +/// analysis result can be obtained by calling: +/// +/// LAP->getResult(); +/// LAP->getResult(); +/// +/// At the moment analysis 'B' (or any other analysis even not explicitly tracked +/// by 'LAP') become invalid notify about that and optionally re-enable 'B' if +/// it still needed. Note that 'A' may be implicitly updated as well (if 'A' +//// depends on 'B'). +/// +/// LAP->abandon(); +/// LAP->enable(); // optionally +/// +/// Similarly, when 'C' is needed simply enable it and use: +/// +/// LAP->enable(true /* is_required_analysis */); +/// LAP->getResult(); +template class LocalAnalysesProvider; +template +class LocalAnalysesProvider> { + using LocalAnalysesTupleT = std::tuple; + + FunctionAnalysisManager &FAM; + Function &F; + + template + using AnalysesInfoElemT = + std::tuple; + + // Data structure to keep results and status of local analysis. + // For each local analysis we maintain 4 things: + // is_enabled - whether analysis should be tracked or not + // is_valid - whether locally cached analysis result is valid or not. Note + /// that global result (as seen by FAM) may or may not be valid. + // is_required - whether analysis is "required" or "optional" + std::tuple< + std::tuple...> + LocalAnalysisInfo; + + // Set of preserved analysis communicated to FAM during invalidation. Please + // note that actual analysis result may still be invalidated (even if listed + // in GlobalPA) if one of the analysis it depends on is invalidated. + PreservedAnalyses GlobalPA = PreservedAnalyses::all(); + + // 'true' if any analysis has been invalidated trough the corresponding API + // since last time FAM has been notified about invalidation. + bool InvalidateGlobalAnalyses = false; + +public: + explicit LocalAnalysesProvider(FunctionAnalysisManager &FAM, Function &F) + : FAM(FAM), F(F) {} + + /// Returns locally cached result of the specified analysis if known to be + /// valid. If locally cached result is not known to be invalid or specified + /// analysis hasn't been enabled via corresponding API nullptr is returned. + template + typename Analysis::Result *getResultIfValid() const { + if (IsEnabled() && isValid()) + return const_cast(getResultImpl()); + + return nullptr; + } + + /// Returns locally cached result of the specified analysis. It makes sure + /// the returned result is up to date before returning it. If specified + /// analysis hasn't been enabled via corresponding API nullptr is returned. + template typename Analysis::Result *getResult() { + if (!IsEnabled()) + return nullptr; + + updateLocalAnalysis(); + return getResultImpl(); + } + + /// Returns set of "supposed to be valid/preserved" analyses. In other words, + /// the set consists of all not explicitly invalidated analyses. Note that + /// local as well as global analysis result may still be not valid due to + /// dependencies between analysis. + PreservedAnalyses getPreservedAnalyses() const { return GlobalPA; } + + /// Turns on local tracking of the specified analysis. If 'IsRequired' is true + /// and there is no global result of the analysis available it will be + /// re-created. If 'IsRequired' is false a cached global result will be used + /// if exist. + template + void enable(bool IsRequired) { + if (IsRequired) + enableRequiredAnalysis(); + else + enableOptionalAnalysis(); + } + + /// Turns on local tracking of the specified analyses. See 'enable(bool + /// IsRequired) API for details about "required" and "optional" analysis'. + template + void enable() { + if ((std::tuple_size_v) > 0) { + enableRequiredAnalyses( + std::make_index_sequence>{}); + } + + enableOptionalAnalyses( + std::make_index_sequence>{}); + } + + /// Turns off local tracking of the specified analysis. Once analysis is + /// disabled 'getResult/getResultIfEanbled' APIs will return + /// nullptr. + template + void disable() { + SetEnabled(false); + } + + /// Notify about invalidation of the specified analysis result. + template + void invalidate() { + InvalidateGlobalAnalyses = true; + (setValid(false), ...); + GlobalPA.abandon(); + } + + /// Notify about invalidation of all analysis results except the specified + /// ones. + template + void invalidateExcept() + { + InvalidateGlobalAnalyses = true; + (setValid(false), ...); + GlobalPA.intersect(as_preserved( + std::make_index_sequence>{})); + } + + /// Same as the above but takes set of "PreservedAnalyses". + void invalidateExcept(PreservedAnalyses PA) + { + InvalidateGlobalAnalyses = true; + (setValid(false), ...); + GlobalPA.intersect(PA); + } + + /// Notify about invalidation of all analysis. + void invalidateAll() + { + InvalidateGlobalAnalyses = true; + (setValid(false), ...); + GlobalPA = PreservedAnalyses::none(); + } + + template + void abandon() { + invalidate(); + disable(); + } + +private: + template + const AnalysesInfoElemT &getAnalysesInfoElem() const { + return std::get>(LocalAnalysisInfo); + } + + template + AnalysesInfoElemT &getAnalysesInfoElem() { + return std::get>(LocalAnalysisInfo); + } + + template bool IsEnabled() const { + return std::get<0>(getAnalysesInfoElem()); + } - LoopVectorizeResult(bool MadeAnyChange, bool MadeCFGChange) - : MadeAnyChange(MadeAnyChange), MadeCFGChange(MadeCFGChange) {} + template void SetEnabled(bool Enabled) { + std::get<0>(getAnalysesInfoElem()) = Enabled; + } + + template + void SetEnabled(bool Enabled, std::index_sequence) { + (SetEnabled>(Enabled), ...); + } + + template bool isValid() const { + return std::get<1>(getAnalysesInfoElem()); + } + + template void setValid(bool Valid) { + std::get<1>(getAnalysesInfoElem()) = Valid; + } + + template + void setValid(bool Valid, std::index_sequence) { + (setValid>(Valid), ...); + } + + template bool isRequired() const { + return std::get<2>(getAnalysesInfoElem()); + } + + template void setRequired(bool Required) { + std::get<2>(getAnalysesInfoElem()) = Required; + } + + template + void setRequired(bool Required, std::index_sequence) { + (setRequired>(Required), ...); + } + + template + typename Analysis::Result *const &getResultImpl() const { + return std::get<3>(getAnalysesInfoElem()); + } + + template + typename Analysis::Result *&getResultImpl() { + return std::get<3>(getAnalysesInfoElem()); + } + + template + void setResultImpl(typename Analysis::Result *R) { + setValid(true); + getResultImpl() = R; + } + + template void updateLocalAnalysis() { + if (!IsEnabled()) + return; + + if (!isValid() || !getResultImpl()) { + if (InvalidateGlobalAnalyses) { + FAM.invalidate(F, GlobalPA); + InvalidateGlobalAnalyses = false; + } + typename Analysis::Result *Res = nullptr; + if constexpr(std::is_same_v) { + auto &MAMProxy = FAM.getResult(F); + Res = MAMProxy.getCachedResult(*F.getParent()); + } else if (isRequired()){ + Res = &FAM.getResult(F); + } else { + Res = FAM.getCachedResult(F); + } + setResultImpl(Res); + } + } + + template void enableRequiredAnalysis() { + static_assert( + !std::is_same::value && + "PSI can only be provided on best effort basis"); + SetEnabled(true); + setRequired(true); + updateLocalAnalysis(); + } + + template + void enableRequiredAnalyses(std::index_sequence) { + (enableRequiredAnalysis>(), ...); + } + + template + void enableOptionalAnalysis() { + SetEnabled(true); + setRequired(false); + updateLocalAnalysis(); + } + + template + void enableOptionalAnalyses(std::index_sequence) { + (enableOptionalAnalysis>(), ...); + } + + template + PreservedAnalyses as_preserved(std::index_sequence) const { + PreservedAnalyses PA; + (PA.preserve>(), ...); + return PA; + } }; /// The LoopVectorize Pass. struct LoopVectorizePass : public PassInfoMixin { +public: + using VecRequiredAnalyses = + std::tuple; + using VecOptionalAnalysis = + std::tuple; + using AllLocalAnalyses = + decltype(std::tuple_cat(std::declval(), + std::declval())); + using VecAnalysesProviderT = LocalAnalysesProvider; + private: + std::unique_ptr LAP = nullptr; + Function *F; + /// If false, consider all loops for interleaving. /// If true, only loops that explicitly request interleaving are considered. bool InterleaveOnlyWhenForced; @@ -170,32 +487,22 @@ public: LoopVectorizePass(LoopVectorizeOptions Opts = {}); - ScalarEvolution *SE; - LoopInfo *LI; - TargetTransformInfo *TTI; - DominatorTree *DT; - BlockFrequencyInfo *BFI; - TargetLibraryInfo *TLI; - DemandedBits *DB; - AssumptionCache *AC; - LoopAccessInfoManager *LAIs; - OptimizationRemarkEmitter *ORE; - ProfileSummaryInfo *PSI; - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); void printPipeline(raw_ostream &OS, function_ref MapClassName2PassName); + PreservedAnalyses &getAllLocalRequiredAnalyses() const; + PreservedAnalyses &getAllLocalOptionalAnalyses() const; + template + void updateOneAnalysisImpl(typename AnalysisT::Result *&Result, + std::optional DoUpdate); + template + void updateOneAnalysis(typename AnalysisT::Result *&Result, bool Force); + void updateLocalAnalyses(const PreservedAnalyses &RequiredAnalyses, + const PreservedAnalyses &OptionalAnalyses); - // Shim for old PM. - LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_, - TargetTransformInfo &TTI_, DominatorTree &DT_, - BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AssumptionCache &AC_, - LoopAccessInfoManager &LAIs_, - OptimizationRemarkEmitter &ORE_, - ProfileSummaryInfo *PSI_); + PreservedAnalyses runImpl(); - bool processLoop(Loop *L); + void processLoop(Loop *L); }; /// Reports a vectorization failure: print \p DebugMsg for debugging diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -115,6 +115,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" @@ -1181,17 +1182,15 @@ /// different operations. class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, - PredicatedScalarEvolution &PSE, LoopInfo *LI, - LoopVectorizationLegality *Legal, - const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, DemandedBits *DB, - AssumptionCache *AC, BlockFrequencyInfo *BFI, - OptimizationRemarkEmitter *ORE, const Function *F, - const LoopVectorizeHints *Hints, - InterleavedAccessInfo &IAI) - : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), - TTI(TTI), TLI(TLI), DB(DB), AC(AC), BFI(BFI), ORE(ORE), TheFunction(F), + LoopVectorizationCostModel( + LoopVectorizePass::VecAnalysesProviderT *LAP, ScalarEpilogueLowering SEL, + Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, + const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, const Function *F, + const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) + : ScalarEpilogueStatus(SEL), LAP(LAP), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factors (both fixed and @@ -1868,6 +1867,9 @@ const BasicBlock *BB) const; public: + /// TODO: + LoopVectorizePass::VecAnalysesProviderT *LAP; + /// The loop that we evaluate. Loop *TheLoop; @@ -1892,8 +1894,6 @@ /// Assumption cache. AssumptionCache *AC; - BlockFrequencyInfo *BFI; - /// Interface to emit optimization remarks. OptimizationRemarkEmitter *ORE; @@ -1963,7 +1963,7 @@ /// un-linked from the IR and is added back during vector code generation. If /// there is no vector code generation, the check blocks are removed /// completely. - void Create(Loop *L, const LoopAccessInfo &LAI, + bool Create(Loop *L, const LoopAccessInfo &LAI, const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { // Hard cutoff to limit compile-time increase in case a very large number of @@ -1973,7 +1973,7 @@ CostTooHigh = LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; if (CostTooHigh) - return; + return false; BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); @@ -2018,7 +2018,7 @@ } if (!MemCheckBlock && !SCEVCheckBlock) - return; + return false; // Unhook the temporary block with the checks, update various places // accordingly. @@ -2047,6 +2047,7 @@ DT->eraseNode(SCEVCheckBlock); LI->removeBlock(SCEVCheckBlock); } + return true; } InstructionCost getCost() { @@ -5602,12 +5603,16 @@ InstructionCost LoopVectorizationCostModel::getInstCostScaledByFreq( InstructionCost &Cost, const BasicBlock *BB) const { + // TODO: enable + //assert(BFI && ""); + if (!Cost.isValid()) { return Cost; } - if (!BFI) - return Cost / getReciprocalPredBlockProb(); + // TODO: Remove + LAP->enable(true); + auto *BFI = LAP->getResult(); auto HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency(); @@ -9887,26 +9892,32 @@ // VPlan upfront in the vectorization pipeline, which allows to apply // VPlan-to-VPlan transformations from the very beginning without modifying the // input LLVM IR. -static bool processLoopInVPlanNativePath( - Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, - LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, - TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, - LoopVectorizationRequirements &Requirements) { +static void processLoopInVPlanNativePath( + LoopVectorizePass::VecAnalysesProviderT *LAP, Loop *L, + PredicatedScalarEvolution &PSE, LoopVectorizationLegality *LVL, + LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) { + + auto *DT = LAP->getResult(); + auto *TTI = LAP->getResult(); + auto *TLI = LAP->getResult(); + auto *LI = LAP->getResult(); + auto *ORE = LAP->getResult(); + auto *DB = LAP->getResult(); + auto *AC = LAP->getResult(); if (isa(PSE.getBackedgeTakenCount())) { LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); - return false; + return; } assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, LAP->getResult(), + LAP->getResult(), TTI, TLI, *LVL, &IAI); - LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, BFI, + LoopVectorizationCostModel CM(LAP, SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an @@ -9925,7 +9936,7 @@ // code. Masked vector code generation support will follow soon. // Also, do not attempt to vectorize if no vector code will be produced. if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) - return false; + return; VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); @@ -9933,7 +9944,9 @@ GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + VF.Width, 1, LVL, &CM, + LAP->getResult(), + LAP->getResult(), Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); @@ -9942,7 +9955,10 @@ // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); - return true; + + // TODO: Preserve Loop and Dominator analyses for VPlan-native path. + LAP->invalidateAll(); + return; } // Emit a remark if there are stores to floats that required a floating point @@ -10095,10 +10111,29 @@ VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced || !EnableLoopVectorization) {} -bool LoopVectorizePass::processLoop(Loop *L) { +void LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. Only process inner loops."); + auto *LI = LAP->getResultIfValid(); + assert(LI && "LI is expected to be valid since L depends on it"); + + LAP->enable(); + + auto *SE = LAP->getResult(); + auto *DT = LAP->getResult(); + auto *TTI = LAP->getResult(); + auto *TLI = LAP->getResult(); + auto *LAIs = LAP->getResult(); + auto *ORE = LAP->getResult(); + auto *DB = LAP->getResult(); + auto *AC = LAP->getResult(); + + auto *PSI = LAP->getResult(); + if (PSI && PSI->hasProfileSummary()) { + LAP->enable(true); + } + #ifndef NDEBUG const std::string DebugLocStr = getDebugLocString(L); #endif /* NDEBUG */ @@ -10133,7 +10168,7 @@ if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) { LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n"); - return false; + return; } PredicatedScalarEvolution PSE(*SE, *L); @@ -10141,11 +10176,14 @@ // Check if it is legal to vectorize the loop. LoopVectorizationRequirements Requirements; LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE, - &Requirements, &Hints, DB, AC, BFI, PSI); + &Requirements, &Hints, DB, AC, + LAP->getResult(), + LAP->getResult()); + if (!LVL.canVectorize(EnableVPlanNativePath)) { LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); Hints.emitRemarkWithHints(); - return false; + return; } // Entrance to the VPlan-native vectorization path. Outer loops are processed @@ -10153,9 +10191,10 @@ // even evaluating whether vectorization is profitable. Since we cannot modify // the incoming IR, we need to build VPlan upfront in the vectorization // pipeline. - if (!L->isInnermost()) - return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, - ORE, BFI, PSI, Hints, Requirements); + if (!L->isInnermost()) { + processLoopInVPlanNativePath(LAP.get(), L, PSE, &LVL, Hints, Requirements); + return; + } assert(L->isInnermost() && "Inner loop expected."); @@ -10172,8 +10211,9 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, LAP->getResult(), + LAP->getResult(), TTI, TLI, LVL, &IAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -10196,7 +10236,7 @@ "loop trip count is too low, avoiding vectorization", "LowTripCount", ORE, L); Hints.emitRemarkWithHints(); - return false; + return; } } } @@ -10209,7 +10249,7 @@ "loop not vectorized due to NoImplicitFloat attribute", "NoImplicitFloat", ORE, L); Hints.emitRemarkWithHints(); - return false; + return; } // Check if the target supports potentially unsafe FP vectorization. @@ -10223,7 +10263,7 @@ "loop not vectorized due to unsafe FP support.", "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); - return false; + return; } bool AllowOrderedReductions; @@ -10244,12 +10284,12 @@ LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to " "reorder floating-point operations\n"); Hints.emitRemarkWithHints(); - return false; + return; } // Use the cost model. - LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, BFI, - ORE, F, &Hints, IAI); + LoopVectorizationCostModel CM(LAP.get(), SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); @@ -10274,11 +10314,16 @@ // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + if (Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, + SelectedIC)) { + // Request dummy 'ShouldRunExtraVectorPasses' analysis to indicate + // that runtime checks has been generated. + LAP->enable(true); + } - // Check if it is profitable to vectorize with runtime checks. - bool ForceVectorization = - Hints.getForce() == LoopVectorizeHints::FK_Enabled; + // Check if it is profitable to vectorize with runtime checks. + bool ForceVectorization = + Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, *PSE.getSE())) { @@ -10291,7 +10336,7 @@ }); LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n"); Hints.emitRemarkWithHints(); - return false; + return; } } @@ -10355,7 +10400,7 @@ L->getStartLoc(), L->getHeader()) << IntDiagMsg.second; }); - return false; + return; } else if (!VectorizeLoop && InterleaveLoop) { LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); ORE->emit([&]() { @@ -10386,7 +10431,9 @@ // If we decided that it is not legal to vectorize the loop, then // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM, BFI, PSI, Checks); + &CM, LAP->getResult(), + LAP->getResult(), + Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); @@ -10409,8 +10456,10 @@ // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, &LVL, &CM, BFI, PSI, Checks); + EpilogueVectorizerMainLoop MainILV( + L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, + LAP->getResult(), + LAP->getResult(), Checks); VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, @@ -10421,9 +10470,10 @@ // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI, - Checks); + EpilogueVectorizerEpilogueLoop EpilogILV( + L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, + LAP->getResult(), + LAP->getResult(), Checks); VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); @@ -10483,8 +10533,10 @@ DisableRuntimeUnroll = true; } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); + VF.MinProfitableTripCount, IC, &LVL, &CM, + LAP->getResult(), + LAP->getResult(), + Checks); VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); @@ -10523,26 +10575,35 @@ Hints.setAlreadyVectorized(); } + assert(!SE || SE == LAP->getResult()); + assert(!DT || DT == LAP->getResult()); + assert(!TTI || TTI == LAP->getResult()); + assert(!TLI || TLI == LAP->getResult()); + assert(!LAIs || LAIs == LAP->getResult()); + assert(!ORE || ORE == LAP->getResult()); + assert(!DB || DB == LAP->getResult()); + assert(!AC || AC == LAP->getResult()); + assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); - return true; + + using VecPA = + std::tuple; + LAP->invalidateExcept(); + +#ifdef EXPENSIVE_CHECKS + SE.verify(); +#endif + + return; } -LoopVectorizeResult LoopVectorizePass::runImpl( - Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_, - DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, - DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_, - OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) { - SE = &SE_; - LI = &LI_; - TTI = &TTI_; - DT = &DT_; - BFI = BFI_; - TLI = TLI_; - AC = &AC_; - LAIs = &LAIs_; - DB = &DB_; - ORE = &ORE_; - PSI = PSI_; +PreservedAnalyses LoopVectorizePass::runImpl() { + + assert(!LAP->getResult() && + "ShouldRunExtraVectorPasses analysis should be disabled on entry " + "(conditionally enabled if runtime checks are requested)"); + + LAP->enable(true); // Don't attempt if // 1. the target claims to have no vector registers, and @@ -10551,104 +10612,106 @@ // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. - if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && - TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) - return LoopVectorizeResult(false, false); + { + auto *TTI = LAP->getResult(); + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2) + return LAP->getPreservedAnalyses(); + } - bool Changed = false, CFGChanged = false; + using SimplificationRA = std::tuple; + using SimplificationOA = + std::tuple; + LAP->enable(); + auto *LI = LAP->getResult(); // The vectorizer requires loops to be in simplified form. // Since simplification may add new inner loops, it has to run before the - // legality and profitability checks. This means running the loop vectorizer + // legality and profitability checks. This means running the loop + // vectorizer // will simplify all loops, regardless of whether anything end up being // vectorized. - for (const auto &L : *LI) - Changed |= CFGChanged |= + bool IsAnyLoopSimplified = false; + for (const auto &L : *LI) { + auto *SE = LAP->getResult(); + auto *DT = LAP->getResult(); + auto *AC = LAP->getResult(); + IsAnyLoopSimplified |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + } + + if (IsAnyLoopSimplified) { + using SimplificationPA = decltype( + std::tuple_cat(std::declval(), + std::declval())); + LAP->invalidateExcept(); + } // Build up a worklist of inner-loops to vectorize. This is necessary as // the act of vectorizing or partially unrolling a loop creates new loops // and can invalidate iterators across the loops. - SmallVector Worklist; + + LI = LAP->getResult(); + LAP->enable(true); + SmallVector Worklist; for (Loop *L : *LI) - collectSupportedLoops(*L, LI, ORE, Worklist); + collectSupportedLoops( + *L, LI, LAP->getResult(), Worklist); LoopsAnalyzed += Worklist.size(); + // Now walk the identified inner loops. while (!Worklist.empty()) { + LAP->enable(true); + auto *DT = LAP->getResult(); Loop *L = Worklist.pop_back_val(); // For the inner loops we actually process, form LCSSA to simplify the // transform. - Changed |= formLCSSARecursively(*L, *DT, LI); + bool IsLoopLCSSAed = formLCSSARecursively(*L, *DT, LI); - Changed |= CFGChanged |= processLoop(L); + if (IsLoopLCSSAed) { + // TODO: Does 'formLCSSARecursively' preserves CFGAnalyses set? + LAP->invalidateExcept>(); + } - if (Changed) - LAIs->clear(); - } + // Make sure 'LI' hasn't changed between iterations as a result of potential + // invalidation. Even though it's not explicitly invalidated it may become + // invalid if any other analysis it depends is invalidated. + auto *tmpLI = LAP->getResult(); + (void)tmpLI; + assert(LI == tmpLI && "LoopInfo is expected to survive invalidation"); + processLoop(L); + } // Process each loop nest in the function. - return LoopVectorizeResult(Changed, CFGChanged); + return LAP->getPreservedAnalyses(); } -PreservedAnalyses LoopVectorizePass::run(Function &F, - FunctionAnalysisManager &AM) { - auto &LI = AM.getResult(F); +PreservedAnalyses LoopVectorizePass::run(Function &F_, + FunctionAnalysisManager &AM_) { + LAP = std::make_unique(AM_, F_); + F = &F_; + + // There are no loops in the function. Return before computing other expensive // analyses. - if (LI.empty()) - return PreservedAnalyses::all(); - auto &SE = AM.getResult(F); - auto &TTI = AM.getResult(F); - auto &DT = AM.getResult(F); - auto &TLI = AM.getResult(F); - auto &AC = AM.getResult(F); - auto &DB = AM.getResult(F); - auto &ORE = AM.getResult(F); - - LoopAccessInfoManager &LAIs = AM.getResult(F); - auto &MAMProxy = AM.getResult(F); - ProfileSummaryInfo *PSI = - MAMProxy.getCachedResult(*F.getParent()); - BlockFrequencyInfo *BFI = &AM.getResult(F); - LoopVectorizeResult Result = - runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI); - if (!Result.MadeAnyChange) + if (AM_.getResult(F_).empty()) return PreservedAnalyses::all(); - PreservedAnalyses PA; - if (isAssignmentTrackingEnabled(*F.getParent())) { - for (auto &BB : F) - RemoveRedundantDbgInstrs(&BB); - } - - // We currently do not preserve loopinfo/dominator analyses with outer loop - // vectorization. Until this is addressed, mark these analyses as preserved - // only for non-VPlan-native path. - // TODO: Preserve Loop and Dominator analyses for VPlan-native path. - if (!EnableVPlanNativePath) { - PA.preserve(); - PA.preserve(); - PA.preserve(); + PreservedAnalyses PA = runImpl(); + if (PA.areAllPreserved()) + return PA; -#ifdef EXPENSIVE_CHECKS - SE.verify(); -#endif + if (isAssignmentTrackingEnabled(*F->getParent())) { + for (auto &BB : *F) + RemoveRedundantDbgInstrs(&BB); } - if (Result.MadeCFGChange) { - // Making CFG changes likely means a loop got vectorized. Indicate that - // extra simplification passes should be run. - // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only - // be run if runtime checks have been added. - AM.getResult(F); - PA.preserve(); - } else { - PA.preserveSet(); - } return PA; } diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -246,8 +246,6 @@ ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass -; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis -; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: SimplifyCFGPass diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll --- a/llvm/test/Other/new-pm-lto-defaults.ll +++ b/llvm/test/Other/new-pm-lto-defaults.ll @@ -117,8 +117,6 @@ ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo -; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis -; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo ; CHECK-O23SZ-NEXT: Running pass: SROAPass on foo diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -180,8 +180,6 @@ ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass -; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis -; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -49,17 +49,17 @@ ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> ; CHECK-NEXT: EMIT vp<[[COND:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: Successor(s): pred.store +; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<%5> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[COND]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: vp<%6> = SCALAR-STEPS vp<%3>, ir<1> -; CHECK-NEXT: REPLICATE ir<%cond0> = icmp vp<%6>, ir<13> -; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<%6> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: REPLICATE ir<%cond0> = icmp vp<[[STEPS]]>, ir<13> +; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> ; CHECK-NEXT: REPLICATE ir<%s> = select ir<%cond0>, ir<10>, ir<20> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep> ; CHECK-NEXT: Successor(s): pred.store.continue @@ -69,10 +69,10 @@ ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): loop.0 ; CHECK-EMPTY: -; CHECK-NEXT: loop.0: -; CHECK-NEXT: EMIT vp<%11> = VF * UF + vp<%3> -; CHECK-NEXT: EMIT branch-on-count vp<%11> vp<%1> -; CHECK-NEXT: No successors +; CHECK-NEXT: loop.0: +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]> +; CHECK-NEXT: No successors ; CHECK-NEXT:}