diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" @@ -503,10 +504,12 @@ SCEVExpanderCleaner(SCEVExpander &Expander, DominatorTree &DT) : Expander(Expander), DT(DT), ResultUsed(false) {} - ~SCEVExpanderCleaner(); + ~SCEVExpanderCleaner() { cleanup(); } /// Indicate that the result of the expansion is used. void markResultUsed() { ResultUsed = true; } + + void cleanup(); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2682,7 +2682,7 @@ return false; } -SCEVExpanderCleaner::~SCEVExpanderCleaner() { +void SCEVExpanderCleaner::cleanup() { // Result is used, nothing to remove. if (ResultUsed) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -438,9 +438,8 @@ return None; } - +struct GeneratedRTChecks; namespace llvm { - /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -464,12 +463,12 @@ OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { + BFI(BFI), PSI(PSI), Check(Check) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -730,10 +729,10 @@ /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. - void emitSCEVChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. - void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); /// Compute the transformed value of Index at offset StartValue using step /// StepValue. @@ -906,6 +905,10 @@ // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; + + /// Structure to hold information about generated runtime checks, responsible + /// for cleaning the checks, if they turn out unprofitable. + GeneratedRTChecks &Check; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -917,10 +920,10 @@ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + BFI, PSI, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -969,9 +972,11 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, + Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -1005,9 +1010,10 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Check) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1027,17 +1033,16 @@ // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: - EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, - llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + EpilogueVectorizerEpilogueLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1864,9 +1869,99 @@ /// Profitable vector factors. SmallVector ProfitableVFs; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front in a temporary block to allow better +/// estimating the cost and un-linked from the existing IR. After deciding to +/// vectorize, the checks are moved backed. If deciding not to vectorize, the +/// temporary block is completely removed. +struct GeneratedRTChecks { + BasicBlock *TmpBlock = nullptr; + BasicBlock *Preheader; + Value *SCEVCheck; + Instruction *FirstCheckInst = nullptr; + Instruction *MemRuntimeCheck = nullptr; + + ScalarEvolution &SE; + DominatorTree *DT; + + SCEVExpander Exp; + SCEVExpanderCleaner Cleaner; + + GeneratedRTChecks(BasicBlock *Preheader, ScalarEvolution &SE, + DominatorTree *DT) + : Preheader(Preheader), SE(SE), DT(DT), + Exp(SE, Preheader->getModule()->getDataLayout(), "scev.check"), + Cleaner(Exp, *DT) {} + + /// Generate runtime checks in temporary block (TmpBlock), so we can + /// accurately estimate the cost of the runtime checks. The block is un-linked + /// from the IR and is added back during vector code generation. If there is + /// no vector code generation, the check blocks is removed completely. + void Create(Loop *L, const LoopAccessInfo &LAI, + const SCEVUnionPredicate &UnionPred, LoopInfo *LI) { + BasicBlock *LoopHeader = Preheader->getSingleSuccessor(); + TmpBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "tmp.rtchecks"); + + SCEVCheck = + Exp.expandCodeForPredicate(&UnionPred, TmpBlock->getTerminator()); + + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); + if (RtPtrChecking.Need) { + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + TmpBlock->getTerminator(), L, RtPtrChecking.getChecks(), Exp); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + } + + // Unhook the temporary block with the checks, update various places + // accordingly. + TmpBlock->replaceAllUsesWith(Preheader); + TmpBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + Preheader->getTerminator()->eraseFromParent(); + DT->changeImmediateDominator(LoopHeader, Preheader); + DT->eraseNode(TmpBlock); + LI->removeBlock(TmpBlock); + } + + ~GeneratedRTChecks() { + if (!TmpBlock) { + Cleaner.markResultUsed(); + return; + } + + if (!SCEVCheck && TmpBlock->empty()) { + Cleaner.markResultUsed(); + TmpBlock->eraseFromParent(); + return; + } + + if (MemRuntimeCheck && !isa(MemRuntimeCheck)) + MemRuntimeCheck->replaceAllUsesWith( + ConstantInt::getFalse(MemRuntimeCheck->getType()->getContext())); + if (SCEVCheck && !isa(SCEVCheck)) + SCEVCheck->replaceAllUsesWith( + ConstantInt::getFalse(SCEVCheck->getType()->getContext())); + + SmallPtrSet Removed; + // Completely remove the block. + for (auto &I : make_early_inc_range(reverse(*TmpBlock))) { + if (Exp.isInsertedInstruction(&I)) + continue; + SE.forgetValue(&I); + SE.eraseValueFromMap(&I); + Removed.insert(&I); + I.eraseFromParent(); + } + + Cleaner.cleanup(); + TmpBlock->eraseFromParent(); + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -3132,22 +3227,16 @@ LoopBypassBlocks.push_back(TCCheckBlock); } -void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; - - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); + BasicBlock *const SCEVCheckBlock = Check.TmpBlock; - if (auto *C = dyn_cast(SCEVCheck)) + if (!Check.TmpBlock) + return nullptr; + if (auto *C = dyn_cast(Check.SCEVCheck)) if (C->isZero()) - return; + return nullptr; assert(!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && @@ -3155,10 +3244,30 @@ "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); + Check.TmpBlock = SplitBlock( + SCEVCheckBlock, cast(Check.SCEVCheck)->getNextNode(), + (DominatorTree *)nullptr, nullptr, nullptr, "vector.ph", false); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + Check.TmpBlock->replaceAllUsesWith(SCEVCheckBlock); + Check.TmpBlock->getTerminator()->moveBefore(SCEVCheckBlock->getTerminator()); + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { @@ -3168,27 +3277,28 @@ ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + Check.SCEVCheck = nullptr; + return SCEVCheckBlock; } -void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) - return; + return nullptr; // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = Check.TmpBlock; - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - auto *LAI = Legal->getLAI(); - const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); - if (!RtPtrChecking.Need) - return; + // Check if we generated code that checks in runtime if arrays overlap. We put + // the checks into a separate block to make the more common case of few + // elements faster. + if (!Check.MemRuntimeCheck) + return nullptr; if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -3204,11 +3314,22 @@ }); } + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, Check.TmpBlock); + auto *BI = BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); + BI->setDebugLoc(PHTerm->getDebugLoc()); + + DT->addNewBlock(Check.TmpBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + Check.TmpBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(Check.TmpBlock, *LI); + MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); auto *CondBranch = cast( Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); @@ -3221,16 +3342,12 @@ DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } + Check.TmpBlock = nullptr; - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - SCEVExpander Exp(*PSE.getSE(), MemCheckBlock->getModule()->getDataLayout(), - "induction"); - std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( - MemCheckBlock->getTerminator(), OrigLoop, RtPtrChecking.getChecks(), Exp); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.MemRuntimeCheck)); + AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. @@ -3239,6 +3356,7 @@ Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); + return MemCheckBlock; } Value *InnerLoopVectorizer::emitTransformedIndex( @@ -7883,22 +8001,12 @@ // Generate the code to check any assumptions that we've made for SCEV // expressions. - BasicBlock *SavedPreHeader = LoopVectorPreHeader; - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.SCEVSafetyCheck = SavedPreHeader; + EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - SavedPreHeader = LoopVectorPreHeader; - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save/overwite it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.MemSafetyCheck = SavedPreHeader; + EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -9264,15 +9372,17 @@ LVP.setBestPlan(VF.Width, 1); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); - LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" - << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(LB, DT); + { + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, + &CM, BFI, PSI, Checks); + LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" + << L->getHeader()->getParent()->getName() << "\"\n"); + LVP.executePlan(LB, DT); + } // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9430,16 +9540,9 @@ IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } + bool VectorizeLoop = true, InterleaveLoop = true; // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; - bool VectorizeLoop = true, InterleaveLoop = true; - if (Requirements.doesNotMeet(F, L, Hints)) { - LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " - "requirements.\n"); - Hints.emitRemarkWithHints(); - return false; - } - if (VF.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( @@ -9519,82 +9622,97 @@ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } - LVP.setBestPlan(VF.Width, IC); + if (Requirements.doesNotMeet(F, L, Hints)) { + LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization " + "requirements.\n"); + Hints.emitRemarkWithHints(); + return false; + } - using namespace ore; bool DisableRuntimeUnroll = false; MDNode *OrigLoopID = L->getLoopID(); - - if (!VectorizeLoop) { - assert(IC > 1 && "interleave count should not be 1 or 0"); - // If we decided that it is not legal to vectorize the loop, then - // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); - LVP.executePlan(Unroller, DT); - - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), - L->getHeader()) - << "interleaved loop (interleaved count: " - << NV("InterleaveCount", IC) << ")"; - }); - } else { - // If we decided that it is *legal* to vectorize the loop, then do it. - - // Consider vectorizing the epilogue too if it's profitable. - VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { - - // The first pass vectorizes the main loop and creates a scalar epilogue - // to be vectorized by executing the plan (potentially with a different - // factor) again shortly afterwards. - EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, - EpilogueVF.Width.getKnownMinValue(), 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, - &LVL, &CM, BFI, PSI); - - LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); - LVP.executePlan(MainILV, DT); - ++LoopsVectorized; - - simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); - formLCSSARecursively(*L, *DT, LI, SE); - - // Second pass vectorizes the epilogue and adjusts the control flow - // edges from the first pass. - LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); - EPI.MainLoopVF = EPI.EpilogueVF; - EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI); - LVP.executePlan(EpilogILV, DT); - ++LoopsEpilogueVectorized; - - if (!MainILV.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; + { + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. Limit the scope of Checks, so the cleanup happens + // immediately after vector codegeneration is done. + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); + if (!VF.Width.isScalar() || IC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), LI); + LVP.setBestPlan(VF.Width, IC); + + using namespace ore; + if (!VectorizeLoop) { + assert(IC > 1 && "interleave count should not be 1 or 0"); + // If we decided that it is not legal to vectorize the loop, then + // interleave it. + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, + &CM, BFI, PSI, Checks); + LVP.executePlan(Unroller, DT); + + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), + L->getHeader()) + << "interleaved loop (interleaved count: " + << NV("InterleaveCount", IC) << ")"; + }); } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); - LVP.executePlan(LB, DT); - ++LoopsVectorized; - - // Add metadata to disable runtime unrolling a scalar loop when there are - // no runtime checks about strides and memory. A scalar loop that is - // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) - DisableRuntimeUnroll = true; - } + // If we decided that it is *legal* to vectorize the loop, then do it. + + // Consider vectorizing the epilogue too if it's profitable. + VectorizationFactor EpilogueVF = + CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + if (EpilogueVF.Width.isVector()) { + + // The first pass vectorizes the main loop and creates a scalar epilogue + // to be vectorized by executing the plan (potentially with a different + // factor) again shortly afterwards. + EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, + EpilogueVF.Width.getKnownMinValue(), + 1); + EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, + EPI, &LVL, &CM, BFI, PSI, Checks); + + LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); + LVP.executePlan(MainILV, DT); + ++LoopsVectorized; - // Report the vectorization decision. - ORE->emit([&]() { - return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), - L->getHeader()) - << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) - << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; - }); + simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */); + formLCSSARecursively(*L, *DT, LI, SE); + + // Second pass vectorizes the epilogue and adjusts the control flow + // edges from the first pass. + LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); + EPI.MainLoopVF = EPI.EpilogueVF; + EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, + ORE, EPI, &LVL, &CM, BFI, PSI, + Checks); + LVP.executePlan(EpilogILV, DT); + ++LoopsEpilogueVectorized; + + if (!MainILV.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } else { + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, + &LVL, &CM, BFI, PSI, Checks); + LVP.executePlan(LB, DT); + ++LoopsVectorized; + + // Add metadata to disable runtime unrolling a scalar loop when there + // are no runtime checks about strides and memory. A scalar loop that is + // rarely used is not worth unrolling. + if (!LB.areSafetyChecksAdded()) + DisableRuntimeUnroll = true; + } + // Report the vectorization decision. + ORE->emit([&]() { + return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), + L->getHeader()) + << "vectorized loop (vectorization width: " + << NV("VectorizationFactor", VF.Width) + << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; + }); + } } Optional RemainderLoopID = @@ -9676,6 +9794,8 @@ Changed |= CFGChanged |= processLoop(L); } + assert(!Changed || !verifyFunction(F, &dbgs())); + // Process each loop nest in the function. return LoopVectorizeResult(Changed, CFGChanged); } diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -26,9 +26,9 @@ ; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] ; CHECK: for.body3.lr.ph.us.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] ; CHECK: for.end.us: ; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] @@ -54,12 +54,12 @@ ; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3 ; CHECK: for.body3.lr.ph.us: ; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]]) @@ -74,8 +74,8 @@ ; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -100,7 +100,7 @@ ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll --- a/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll +++ b/llvm/test/Transforms/LoopVectorize/pr47343-expander-lcssa-after-cfg-update.ll @@ -56,7 +56,6 @@ ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[TMP4:%.*]] = phi i8* [ [[TMP1]], %vector.memcheck ], [ [[TMP1]], %loop.preheader ], [ [[TMP1]], %middle.block ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 500, %middle.block ], [ 0, %loop.preheader ], [ 0, %vector.memcheck ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -0,0 +1,32 @@ +; RUN: opt -loop-vectorize -force-vector-width=4 %s | FileCheck %s + +%struct.foo = type { [400 x double] } + +; Make sure we do not crash when dropping runtime checks. + +; CHECK-NOT: vector.body + +define void @barney(%struct.foo* %ptr) { +entry: + br label %loop + +loop: + %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] + %tmp4 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 undef + %tmp5 = bitcast %struct.foo* %tmp4 to i64* + store i64 0, i64* %tmp5, align 8 + %tmp8 = add i64 1, %tmp3 + %tmp10 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp8 + %tmp11 = bitcast %struct.foo* %tmp10 to i64* + store i64 1, i64* %tmp11, align 8 + %tmp14 = add i64 undef, %tmp3 + %tmp16 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp14 + %tmp17 = bitcast %struct.foo* %tmp16 to i64* + store i64 2, i64* %tmp17, align 8 + %tmp18 = add nuw nsw i64 %tmp3, 4 + %c = icmp ult i64 %tmp18, 400 + br i1 %c, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -23,22 +23,22 @@ ; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_3]], label [[LOOP3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] ; CHECK: loop3.preheader: -; CHECK-NEXT: [[L_1_LCSSA11:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] +; CHECK-NEXT: [[L_1_LCSSA8:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_1_LCSSA:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_2_LCSSA:%.*]] = phi i16* [ [[L_2]], [[INNER_BB]] ] -; CHECK-NEXT: [[L_2_LCSSA4:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8* +; CHECK-NEXT: [[L_2_LCSSA3:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8* ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA4]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA3]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[L_1_LCSSA]], i64 1 -; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i16* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP6:%.*]] = bitcast i16* [[SCEVGEP]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], 2 -; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr i16, i16* [[L_1_LCSSA11]], i64 [[TMP1]] -; CHECK-NEXT: [[SCEVGEP1013:%.*]] = bitcast i16* [[SCEVGEP10]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA4]], [[SCEVGEP1013]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP9]], [[UGLYGEP]] +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i16, i16* [[L_1_LCSSA8]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP710:%.*]] = bitcast i16* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA3]], [[SCEVGEP710]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP6]], [[UGLYGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -66,8 +66,6 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[L_18:%.*]] = phi i16* [ [[L_1_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_1_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[L_23:%.*]] = phi i16* [ [[L_2_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_2_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_2_LCSSA]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP3:%.*]] ; CHECK: inner.latch: @@ -79,20 +77,19 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_18]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_1_LCSSA]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[LOOP_L_1:%.*]] = load i16, i16* [[GEP_1]], align 2 -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_23]], i64 0 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_2_LCSSA]], i64 0 ; CHECK-NEXT: store i16 [[LOOP_L_1]], i16* [[GEP_2]], align 2 ; CHECK-NEXT: br i1 [[C_5]], label [[LOOP3]], label [[EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]] ; CHECK: exit.loopexit: -; CHECK-NEXT: [[L_17:%.*]] = phi i16* [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ], [ [[L_18]], [[LOOP3]] ] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit.loopexit1: -; CHECK-NEXT: [[L_1_LCSSA5:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ] +; CHECK-NEXT: [[L_1_LCSSA4:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[L_16:%.*]] = phi i16* [ [[L_1_LCSSA5]], [[EXIT_LOOPEXIT1]] ], [ [[L_17]], [[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: [[L_3:%.*]] = load i16, i16* [[L_16]], align 2 +; CHECK-NEXT: [[L_15:%.*]] = phi i16* [ [[L_1_LCSSA4]], [[EXIT_LOOPEXIT1]] ], [ [[L_1_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[L_3:%.*]] = load i16, i16* [[L_15]], align 2 ; CHECK-NEXT: ret i16 [[L_3]] ; entry: