diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" @@ -506,10 +507,12 @@ SCEVExpanderCleaner(SCEVExpander &Expander, DominatorTree &DT) : Expander(Expander), DT(DT), ResultUsed(false) {} - ~SCEVExpanderCleaner(); + ~SCEVExpanderCleaner() { cleanup(); } /// Indicate that the result of the expansion is used. void markResultUsed() { ResultUsed = true; } + + void cleanup(); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2679,7 +2679,7 @@ return false; } -SCEVExpanderCleaner::~SCEVExpanderCleaner() { +void SCEVExpanderCleaner::cleanup() { // Result is used, nothing to remove. if (ResultUsed) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -69,7 +69,6 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -433,6 +432,9 @@ return None; } +// Forward declare GeneratedRTChecks. +class GeneratedRTChecks; + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -458,11 +460,11 @@ OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI), - PSI(PSI) { + PSI(PSI), RTChecks(RTChecks) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -695,11 +697,14 @@ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); /// Emit a bypass check to see if all of the SCEV assumptions we've - /// had to make are correct. - void emitSCEVChecks(Loop *L, BasicBlock *Bypass); + /// had to make are correct. Returns the block containing the checks or + /// nullptr if no checks have been added. + BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. - void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// Returns the block containing the checks or nullptr if no checks have been + /// added. + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); /// Compute the transformed value of Index at offset StartValue using step /// StepValue. 
@@ -866,6 +871,10 @@ // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; + + /// Structure to hold information about generated runtime checks, responsible + /// for cleaning the checks, if vectorization turns out unprofitable. + GeneratedRTChecks &RTChecks; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -877,10 +886,10 @@ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + BFI, PSI, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -929,9 +938,11 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, + Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -965,9 +976,10 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Check) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -987,17 +999,16 @@ // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: - EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, - llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + EpilogueVectorizerEpilogueLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1833,9 +1844,210 @@ /// Profitable vector factors. 
SmallVector<VectorizationFactor, 8> ProfitableVFs; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front in temporary blocks to allow better +/// estimating the cost and un-linked from the existing IR. After deciding to +/// vectorize, the checks are moved back. If deciding not to vectorize, the +/// temporary blocks are completely removed. +class GeneratedRTChecks { + /// Basic block which contains the generated SCEV checks, if any. + BasicBlock *SCEVCheckBlock = nullptr; + + /// The value representing the result of the generated SCEV checks. If it is + /// nullptr, either no SCEV checks have been generated or they have been used. + Value *SCEVCheckCond = nullptr; + + /// Basic block which contains the generated memory runtime checks, if any. + BasicBlock *MemCheckBlock = nullptr; + + /// The value representing the result of the generated memory runtime checks. + /// If it is nullptr, either no memory runtime checks have been generated or + /// they have been used. + Instruction *MemRuntimeCheckCond = nullptr; + + DominatorTree *DT; + LoopInfo *LI; + + SCEVExpander SCEVExp; + SCEVExpander MemCheckExp; + +public: + GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, + const DataLayout &DL) + : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"), + MemCheckExp(SE, DL, "scev.check") {} + + /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can + /// accurately estimate the cost of the runtime checks. The blocks are + /// un-linked from the IR and added back during vector code generation. If + /// there is no vector code generation, the check blocks are removed + /// completely. + void Create(Loop *L, const LoopAccessInfo &LAI, + const SCEVUnionPredicate &UnionPred) { + + BasicBlock *LoopHeader = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + + // Use SplitBlock to create blocks for SCEV & memory runtime checks to + // ensure the blocks are properly added to LoopInfo & DominatorTree. Those + // may be used by SCEVExpander. The blocks will be un-linked from their + // predecessors and removed from LI & DT at the end of the function. + if (!UnionPred.isAlwaysTrue()) { + SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "vector.scevcheck"); + + SCEVCheckCond = SCEVExp.expandCodeForPredicate( + &UnionPred, SCEVCheckBlock->getTerminator()); + } + + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); + if (RtPtrChecking.Need) { + auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader; + MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr, + "vector.memcheck"); + + std::tie(std::ignore, MemRuntimeCheckCond) = + addRuntimeChecks(MemCheckBlock->getTerminator(), L, + RtPtrChecking.getChecks(), MemCheckExp); + assert(MemRuntimeCheckCond && + "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + } + + if (!MemCheckBlock && !SCEVCheckBlock) + return; + + // Unhook the temporary block with the checks, update various places + // accordingly.
+ if (SCEVCheckBlock) + SCEVCheckBlock->replaceAllUsesWith(Preheader); + if (MemCheckBlock) + MemCheckBlock->replaceAllUsesWith(Preheader); + + if (SCEVCheckBlock) { + SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); + Preheader->getTerminator()->eraseFromParent(); + } + if (MemCheckBlock) { + MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + new UnreachableInst(Preheader->getContext(), MemCheckBlock); + Preheader->getTerminator()->eraseFromParent(); + } + + DT->changeImmediateDominator(LoopHeader, Preheader); + if (MemCheckBlock) { + DT->eraseNode(MemCheckBlock); + LI->removeBlock(MemCheckBlock); + } + if (SCEVCheckBlock) { + DT->eraseNode(SCEVCheckBlock); + LI->removeBlock(SCEVCheckBlock); + } + } + + /// Remove the created SCEV & memory runtime check blocks & instructions, if + /// unused. + ~GeneratedRTChecks() { + SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT); + SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT); + if (!SCEVCheckCond) + SCEVCleaner.markResultUsed(); + + if (!MemRuntimeCheckCond) + MemCheckCleaner.markResultUsed(); + + if (MemRuntimeCheckCond) { + auto &SE = *MemCheckExp.getSE(); + // Memory runtime check generation creates compares that use expanded + // values. Remove them before running the SCEVExpanderCleaners. + for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) { + if (MemCheckExp.isInsertedInstruction(&I)) + continue; + SE.forgetValue(&I); + SE.eraseValueFromMap(&I); + I.eraseFromParent(); + } + } + MemCheckCleaner.cleanup(); + SCEVCleaner.cleanup(); + + if (SCEVCheckCond) + SCEVCheckBlock->eraseFromParent(); + if (MemRuntimeCheckCond) + MemCheckBlock->eraseFromParent(); + } + + /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and + /// adjusts the branches to branch to the vector preheader or \p Bypass, + /// depending on the generated condition. + BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *LoopVectorPreHeader, + BasicBlock *LoopExitBlock) { + if (!SCEVCheckCond) + return nullptr; + if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond)) + if (C->isZero()) + return nullptr; + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); + // Create new preheader for vector loop. + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, + SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond)); + // Mark the check as used, to prevent it from being removed during cleanup. + SCEVCheckCond = nullptr; + return SCEVCheckBlock; + } + + /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts + /// the branches to branch to the vector preheader or \p Bypass, depending on + /// the generated condition. + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass, + BasicBlock *LoopVectorPreHeader) { + // Check if we generated code that checks in runtime if arrays overlap.
+ if (!MemRuntimeCheckCond) + return nullptr; + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, + MemCheckBlock); + + DT->addNewBlock(MemCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + MemCheckBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(MemCheckBlock, *LI); + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond)); + MemCheckBlock->getTerminator()->setDebugLoc( + Pred->getTerminator()->getDebugLoc()); + + // Mark the check as used, to prevent it from being removed during cleanup. + MemRuntimeCheckCond = nullptr; + return MemCheckBlock; + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2996,33 +3208,18 @@ LoopBypassBlocks.push_back(TCCheckBlock); } -void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - // Reuse existing vector loop preheader for SCEV checks. - // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; - - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); - - if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) - if (C->isZero()) - return; +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { + + BasicBlock *const SCEVCheckBlock = + RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock); + if (!SCEVCheckBlock) + return nullptr; assert(!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) && "Cannot SCEV check stride or overflow when optimizing for size"); - SCEVCheckBlock->setName("vector.scevcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { @@ -3030,29 +3227,25 @@ DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); } - ReplaceInstWithInst( - SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + return SCEVCheckBlock; } -void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) - return; + return nullptr; - // Reuse existing vector loop preheader for runtime memory checks. - // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = + RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader); - // Generate the code that checks in runtime if arrays overlap.
We put the - // checks into a separate block to make the more common case of few elements - // faster. - auto *LAI = Legal->getLAI(); - const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); - if (!RtPtrChecking.Need) - return; + // Check if we generated code that checks in runtime if arrays overlap. We put + // the checks into a separate block to make the more common case of few + // elements faster. + if (!MemCheckBlock) + return nullptr; if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -3068,33 +3261,9 @@ }); } - MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); - - auto *CondBranch = cast( - Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); - ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch); LoopBypassBlocks.push_back(MemCheckBlock); - AddedSafetyChecks = true; - - // Update dominator only if this is first RT check. - if (LoopBypassBlocks.empty()) { - DT->changeImmediateDominator(Bypass, MemCheckBlock); - DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); - } - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - SCEVExpander Exp(*PSE.getSE(), MemCheckBlock->getModule()->getDataLayout(), - "induction"); - std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( - MemCheckBlock->getTerminator(), OrigLoop, RtPtrChecking.getChecks(), Exp); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. @@ -3103,6 +3272,7 @@ Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); + return MemCheckBlock; } Value *InnerLoopVectorizer::emitTransformedIndex( @@ -7779,22 +7949,12 @@ // Generate the code to check any assumptions that we've made for SCEV // expressions. - BasicBlock *SavedPreHeader = LoopVectorPreHeader; - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.SCEVSafetyCheck = SavedPreHeader; + EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - SavedPreHeader = LoopVectorPreHeader; - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save/overwite it. 
- if (SavedPreHeader != LoopVectorPreHeader) - EPI.MemSafetyCheck = SavedPreHeader; + EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -9241,15 +9401,18 @@ LVP.setBestPlan(VF.Width, 1); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); - LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" - << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(LB, DT); + { + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, + &CM, BFI, PSI, Checks); + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" + << L->getHeader()->getParent()->getName() << "\"\n"); + LVP.executePlan(LB, DT); + } // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9541,82 +9704,91 @@ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - LVP.setBestPlan(VF.Width, IC); - - using namespace ore; bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); + { + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. Limit the scope of Checks, so the cleanup happens + // immediately after vector codegeneration is done. + GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, + F->getParent()->getDataLayout()); + if (!VF.Width.isScalar() || IC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate()); + LVP.setBestPlan(VF.Width, IC); + + using namespace ore; + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop, then + // interleave it. + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, + &CM, BFI, PSI, Checks); + LVP.executePlan(Unroller, DT); - if (!VectorizeLoop) { - assert(IC > 1 && "interleave count should not be 1 or 0"); - // If we decided that it is not legal to vectorize the loop, then - // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); - LVP.executePlan(Unroller, DT); - - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"; - }); - } else { - // If we decided that it is *legal* to vectorize the loop, then do it. - - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, - EpilogueVF.Width.getKnownMinValue(), 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, - &LVL, &CM, BFI, PSI); - - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); - LVP.executePlan(MainILV, DT); - ++LoopsVectorized; - - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. - LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI); - LVP.executePlan(EpilogILV, DT); - ++LoopsEpilogueVectorized; - - if (!MainILV.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; - } + // If we decided that it is *legal* to vectorize the loop, then do it. + + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), + 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, &LVL, &CM, BFI, PSI, Checks); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; - // Report the vectorization decision. - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), - L->getHeader()) - << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) - << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; - }); + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI, + Checks); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI, Checks); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there + // are no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. 
+ if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } + // Report the vectorization decision. + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), + L->getHeader()) + << "vectorized loop (vectorization width: " + << NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; + }); + } if (ORE->allowExtraAnalysis(LV_NAME)) checkMixedPrecision(L, ORE); diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -26,9 +26,9 @@ ; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] ; CHECK: for.body3.lr.ph.us.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] ; CHECK: for.end.us: ; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] @@ -54,12 +54,12 @@ ; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3 ; CHECK: for.body3.lr.ph.us: ; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]]) @@ -74,8 +74,8 @@ ; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -100,7 +100,7 @@ ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll --- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -56,7 +56,6 @@ ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[TMP4:%.*]] = phi i8* [ [[TMP1]], %vector.memcheck ], [ [[TMP1]], %loop.preheader ], [ [[TMP1]], %middle.block ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, %middle.block ], [ 0, %loop.preheader ], [ 0, %vector.memcheck ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -0,0 +1,32 @@ +; RUN: opt -loop-vectorize -force-vector-width=4 %s | FileCheck %s + +%struct.foo = type { [400 x double] } + +; Make sure we do not crash when dropping runtime checks. + +; CHECK-NOT: vector.body + +define void @barney(%struct.foo* %ptr) { +entry: + br label %loop + +loop: + %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] + %tmp4 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 undef + %tmp5 = bitcast %struct.foo* %tmp4 to i64* + store i64 0, i64* %tmp5, align 8 + %tmp8 = add i64 1, %tmp3 + %tmp10 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp8 + %tmp11 = bitcast %struct.foo* %tmp10 to i64* + store i64 1, i64* %tmp11, align 8 + %tmp14 = add i64 undef, %tmp3 + %tmp16 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp14 + %tmp17 = bitcast %struct.foo* %tmp16 to i64* + store i64 2, i64* %tmp17, align 8 + %tmp18 = add nuw nsw i64 %tmp3, 4 + %c = icmp ult i64 %tmp18, 400 + br i1 %c, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -23,22 +23,22 @@ ; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_3]], label [[LOOP3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] ; CHECK: loop3.preheader: -; CHECK-NEXT: [[L_1_LCSSA11:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] +; CHECK-NEXT: [[L_1_LCSSA8:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_1_LCSSA:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_2_LCSSA:%.*]] = phi i16* [ [[L_2]], [[INNER_BB]] ] -; CHECK-NEXT: [[L_2_LCSSA4:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8* +; CHECK-NEXT: [[L_2_LCSSA3:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8* ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA4]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA3]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[L_1_LCSSA]], i64 1 -; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i16* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP6:%.*]] = bitcast i16* [[SCEVGEP]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = add i64 
[[N]], 2 -; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr i16, i16* [[L_1_LCSSA11]], i64 [[TMP1]] -; CHECK-NEXT: [[SCEVGEP1013:%.*]] = bitcast i16* [[SCEVGEP10]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA4]], [[SCEVGEP1013]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP9]], [[UGLYGEP]] +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i16, i16* [[L_1_LCSSA8]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP710:%.*]] = bitcast i16* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA3]], [[SCEVGEP710]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP6]], [[UGLYGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -66,8 +66,6 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[L_18:%.*]] = phi i16* [ [[L_1_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_1_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[L_23:%.*]] = phi i16* [ [[L_2_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_2_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_2_LCSSA]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP3:%.*]] ; CHECK: inner.latch: @@ -79,20 +77,19 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_18]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_1_LCSSA]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[LOOP_L_1:%.*]] = load i16, i16* [[GEP_1]], align 2 -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_23]], i64 0 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_2_LCSSA]], i64 0 ; CHECK-NEXT: store i16 [[LOOP_L_1]], i16* [[GEP_2]], align 2 ; CHECK-NEXT: br i1 [[C_5]], label [[LOOP3]], label [[EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]] ; CHECK: exit.loopexit: -; CHECK-NEXT: [[L_17:%.*]] = phi i16* [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ], [ [[L_18]], [[LOOP3]] ] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit.loopexit1: -; CHECK-NEXT: [[L_1_LCSSA5:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ] +; CHECK-NEXT: [[L_1_LCSSA4:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[L_16:%.*]] = phi i16* [ [[L_1_LCSSA5]], [[EXIT_LOOPEXIT1]] ], [ [[L_17]], [[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: [[L_3:%.*]] = load i16, i16* [[L_16]], align 2 +; CHECK-NEXT: [[L_15:%.*]] = phi i16* [ [[L_1_LCSSA4]], [[EXIT_LOOPEXIT1]] ], [ [[L_1_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[L_3:%.*]] = load i16, i16* [[L_15]], align 2 ; CHECK-NEXT: ret i16 [[L_3]] ; entry: