diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -36,6 +36,7 @@ class OptimizationRemarkEmitter; class PredIteratorCache; class ScalarEvolution; +class ScalarEvolutionExpander; class SCEV; class SCEVExpander; class TargetLibraryInfo; @@ -471,7 +472,7 @@ std::pair addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl &PointerChecks, - ScalarEvolution *SE); + SCEVExpander &Expander); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" @@ -199,6 +200,8 @@ ChainedPhis.clear(); } + ScalarEvolution *getSE() { return &SE; } + /// Return a vector containing all instructions inserted during expansion. SmallVector getAllInsertedInstructions() const { SmallVector Result; @@ -511,10 +514,12 @@ SCEVExpanderCleaner(SCEVExpander &Expander, DominatorTree &DT) : Expander(Expander), DT(DT), ResultUsed(false) {} - ~SCEVExpanderCleaner(); + ~SCEVExpanderCleaner() { cleanup(); } /// Indicate that the result of the expansion is used. void markResultUsed() { ResultUsed = true; } + + void cleanup(); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1569,7 +1569,8 @@ /// in \p TheLoop. \return the values for the bounds. static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, Loop *TheLoop, Instruction *Loc, - SCEVExpander &Exp, ScalarEvolution *SE) { + SCEVExpander &Exp) { + ScalarEvolution *SE = Exp.getSE(); // TODO: Add helper to retrieve pointers to CG. Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1608,16 +1609,15 @@ /// lower bounds for both pointers in the check. static SmallVector, 4> expandBounds(const SmallVectorImpl &PointerChecks, Loop *L, - Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) { + Instruction *Loc, SCEVExpander &Exp) { SmallVector, 4> ChecksWithBounds; // Here we're relying on the SCEV Expander's cache to only emit code for the // same bounds once. transform(PointerChecks, std::back_inserter(ChecksWithBounds), [&](const RuntimePointerCheck &Check) { - PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE), - Second = - expandBounds(Check.second, L, Loc, Exp, SE); + PointerBounds First = expandBounds(Check.first, L, Loc, Exp), + Second = expandBounds(Check.second, L, Loc, Exp); return std::make_pair(First, Second); }); @@ -1627,12 +1627,10 @@ std::pair llvm::addRuntimeChecks( Instruction *Loc, Loop *TheLoop, const SmallVectorImpl &PointerChecks, - ScalarEvolution *SE) { + SCEVExpander &Exp) { // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible. // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible - const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); - SCEVExpander Exp(*SE, DL, "induction"); - auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp); + auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp); LLVMContext &Ctx = Loc->getContext(); Instruction *FirstInst = nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -60,9 +60,12 @@ // Add the memcheck in the original preheader (this is empty initially). BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop, - AliasChecks, RtPtrChecking.getSE()); + + SCEVExpander Exp2(*RtPtrChecking.getSE(), + VersionedLoop->getHeader()->getModule()->getDataLayout(), + "induction"); + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + RuntimeCheckBB->getTerminator(), VersionedLoop, AliasChecks, Exp2); SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), "scev.check"); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2702,7 +2702,7 @@ return false; } -SCEVExpanderCleaner::~SCEVExpanderCleaner() { +void SCEVExpanderCleaner::cleanup() { // Result is used, nothing to remove. if (ResultUsed) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -431,9 +431,8 @@ return None; } - +struct GeneratedRTChecks; namespace llvm { - /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -457,12 +456,12 @@ OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { + BFI(BFI), PSI(PSI), Check(Check) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -719,10 +718,10 @@ /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. - void emitSCEVChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass); /// Emit bypass checks to check any memory assumptions we may have made. - void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); /// Compute the transformed value of Index at offset StartValue using step /// StepValue. @@ -895,6 +894,10 @@ // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; + + /// Structure to hold information about generated runtime checks, responsible + /// for cleaning the checks, if they turn out unprofitable. + GeneratedRTChecks &Check; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -906,10 +909,10 @@ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + BFI, PSI, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -958,9 +961,11 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI), + EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI, + Checks), EPI(EPI) {} // Override this function to handle the more complex control flow around the @@ -994,9 +999,10 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Check) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1016,17 +1022,16 @@ // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { public: - EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, - LoopInfo *LI, DominatorTree *DT, - const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, - EpilogueLoopVectorizationInfo &EPI, - LoopVectorizationLegality *LVL, - llvm::LoopVectorizationCostModel *CM, - BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI) + EpilogueVectorizerEpilogueLoop( + Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, + DominatorTree *DT, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, + LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, + BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + GeneratedRTChecks &Checks) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, LVL, CM, BFI, PSI) {} + EPI, LVL, CM, BFI, PSI, Checks) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; @@ -1839,9 +1844,99 @@ /// Profitable vector factors. SmallVector ProfitableVFs; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front in a temporary block to allow better +/// estimating the cost and un-linked from the existing IR. After deciding to +/// vectorize, the checks are moved backed. If deciding not to vectorize, the +/// temporary block is completely removed. +struct GeneratedRTChecks { + BasicBlock *TmpBlock = nullptr; + BasicBlock *Preheader; + Value *SCEVCheck; + Instruction *FirstCheckInst = nullptr; + Instruction *MemRuntimeCheck = nullptr; + + ScalarEvolution &SE; + DominatorTree *DT; + + SCEVExpander Exp; + SCEVExpanderCleaner Cleaner; + + GeneratedRTChecks(BasicBlock *Preheader, ScalarEvolution &SE, + DominatorTree *DT) + : Preheader(Preheader), SE(SE), DT(DT), + Exp(SE, Preheader->getModule()->getDataLayout(), "scev.check"), + Cleaner(Exp, *DT) {} + + /// Generate runtime checks in temporary block (TmpBlock), so we can + /// accurately estimate the cost of the runtime checks. The block is un-linked + /// from the IR and is added back during vector code generation. If there is + /// no vector code generation, the check blocks is removed completely. + void Create(Loop *L, const LoopAccessInfo &LAI, + const SCEVUnionPredicate &UnionPred, LoopInfo *LI) { + BasicBlock *LoopHeader = Preheader->getSingleSuccessor(); + TmpBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "tmp.rtchecks"); + + SCEVCheck = + Exp.expandCodeForPredicate(&UnionPred, TmpBlock->getTerminator()); + + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); + if (RtPtrChecking.Need) { + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + TmpBlock->getTerminator(), L, RtPtrChecking.getChecks(), Exp); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + } + + // Unhook the temporary block with the checks, update various places + // accordingly. + TmpBlock->replaceAllUsesWith(Preheader); + TmpBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + Preheader->getTerminator()->eraseFromParent(); + DT->changeImmediateDominator(LoopHeader, Preheader); + DT->eraseNode(TmpBlock); + LI->removeBlock(TmpBlock); + } + + ~GeneratedRTChecks() { + if (!TmpBlock) { + Cleaner.markResultUsed(); + return; + } + + if (!SCEVCheck && TmpBlock->empty()) { + Cleaner.markResultUsed(); + TmpBlock->eraseFromParent(); + return; + } + + if (MemRuntimeCheck && !isa(MemRuntimeCheck)) + MemRuntimeCheck->replaceAllUsesWith( + ConstantInt::getFalse(MemRuntimeCheck->getType()->getContext())); + if (SCEVCheck && !isa(SCEVCheck)) + SCEVCheck->replaceAllUsesWith( + ConstantInt::getFalse(SCEVCheck->getType()->getContext())); + + SmallPtrSet Removed; + // Completely remove the block. + for (auto &I : make_early_inc_range(reverse(*TmpBlock))) { + if (Exp.isInsertedInstruction(&I)) + continue; + SE.forgetValue(&I); + SE.eraseValueFromMap(&I); + Removed.insert(&I); + I.eraseFromParent(); + } + + Cleaner.cleanup(); + TmpBlock->eraseFromParent(); + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -3100,22 +3195,16 @@ LoopBypassBlocks.push_back(TCCheckBlock); } -void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; + BasicBlock *const SCEVCheckBlock = Check.TmpBlock; - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); - - if (auto *C = dyn_cast(SCEVCheck)) + if (!Check.TmpBlock) + return nullptr; + if (auto *C = dyn_cast(Check.SCEVCheck)) if (C->isZero()) - return; + return nullptr; assert(!(SCEVCheckBlock->getParent()->hasOptSize() || (OptForSizeBasedOnProfile && @@ -3123,10 +3212,30 @@ "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); + Check.TmpBlock = SplitBlock(SCEVCheckBlock, + cast(Check.SCEVCheck)->getNextNode(), + nullptr, nullptr, nullptr, ""); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + Check.TmpBlock->replaceAllUsesWith(SCEVCheckBlock); + Check.TmpBlock->getTerminator()->moveBefore(SCEVCheckBlock->getTerminator()); + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { @@ -3136,27 +3245,28 @@ ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + Check.SCEVCheck = nullptr; + return SCEVCheckBlock; } -void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { +BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { // VPlan-native path does not do any analysis for runtime checks currently. if (EnableVPlanNativePath) - return; + return nullptr; // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = Check.TmpBlock; - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - auto *LAI = Legal->getLAI(); - const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); - if (!RtPtrChecking.Need) - return; + // Check if we generated code that checks in runtime if arrays overlap. We put + // the checks into a separate block to make the more common case of few + // elements faster. + if (!Check.MemRuntimeCheck) + return nullptr; if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -3172,11 +3282,22 @@ }); } + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, Check.TmpBlock); + auto *BI = BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); + BI->setDebugLoc(PHTerm->getDebugLoc()); + + DT->addNewBlock(Check.TmpBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + Check.TmpBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(Check.TmpBlock, *LI); + MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); auto *CondBranch = cast( Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); @@ -3189,15 +3310,12 @@ DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } + Check.TmpBlock = nullptr; - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.MemRuntimeCheck)); + AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. @@ -3206,6 +3324,7 @@ Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT, PSE.getSE()); LVer->prepareNoAliasMetadata(); + return MemCheckBlock; } Value *InnerLoopVectorizer::emitTransformedIndex( @@ -7640,22 +7759,12 @@ // Generate the code to check any assumptions that we've made for SCEV // expressions. - BasicBlock *SavedPreHeader = LoopVectorPreHeader; - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.SCEVSafetyCheck = SavedPreHeader; + EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks at runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - SavedPreHeader = LoopVectorPreHeader; - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // If a safety check was generated save/overwite it. - if (SavedPreHeader != LoopVectorPreHeader) - EPI.MemSafetyCheck = SavedPreHeader; + EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the iteration count check for the main loop, *after* the check // for the epilogue loop, so that the path-length is shorter for the case @@ -8987,8 +9096,9 @@ LVP.setBestPlan(VF.Width, 1); + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); + &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -8996,7 +9106,6 @@ // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9153,6 +9262,11 @@ IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), LI); + // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; @@ -9253,7 +9367,7 @@ // If we decided that it is not legal to vectorize the loop, then // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); + BFI, PSI, Checks); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -9276,7 +9390,7 @@ EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC, EpilogueVF.Width.getKnownMinValue(), 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, - &LVL, &CM, BFI, PSI); + &LVL, &CM, BFI, PSI, Checks); LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF); LVP.executePlan(MainILV, DT); @@ -9290,8 +9404,8 @@ LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI); + EpilogueVectorizerEpilogueLoop EpilogILV( + L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); LVP.executePlan(EpilogILV, DT); ++LoopsEpilogueVectorized; @@ -9299,7 +9413,7 @@ DisableRuntimeUnroll = true; } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); + &LVL, &CM, BFI, PSI, Checks); LVP.executePlan(LB, DT); ++LoopsVectorized; @@ -9333,7 +9447,6 @@ Hints.setAlreadyVectorized(); } - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -9399,6 +9512,8 @@ Changed |= CFGChanged |= processLoop(L); } + assert(!Changed || !verifyFunction(F, &dbgs())); + // Process each loop nest in the function. return LoopVectorizeResult(Changed, CFGChanged); } diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -26,9 +26,9 @@ ; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] ; CHECK: for.body3.lr.ph.us.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] ; CHECK: for.end.us: ; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] @@ -54,12 +54,12 @@ ; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3 ; CHECK: for.body3.lr.ph.us: ; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]]) @@ -74,8 +74,8 @@ ; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -100,7 +100,7 @@ ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -0,0 +1,32 @@ +; RUN: opt -loop-vectorize -force-vector-width=4 %s | FileCheck %s + +%struct.foo = type { [400 x double] } + +; Make sure we do not crash when dropping runtime checks. + +; CHECK-NOT: vector.body + +define void @barney(%struct.foo* %ptr) { +entry: + br label %loop + +loop: + %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] + %tmp4 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 undef + %tmp5 = bitcast %struct.foo* %tmp4 to i64* + store i64 0, i64* %tmp5, align 8 + %tmp8 = add i64 1, %tmp3 + %tmp10 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp8 + %tmp11 = bitcast %struct.foo* %tmp10 to i64* + store i64 1, i64* %tmp11, align 8 + %tmp14 = add i64 undef, %tmp3 + %tmp16 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp14 + %tmp17 = bitcast %struct.foo* %tmp16 to i64* + store i64 2, i64* %tmp17, align 8 + %tmp18 = add nuw nsw i64 %tmp3, 4 + %c = icmp ult i64 %tmp18, 400 + br i1 %c, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll --- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll @@ -23,22 +23,22 @@ ; CHECK-NEXT: [[C_3:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[C_3]], label [[LOOP3_PREHEADER:%.*]], label [[INNER_LATCH:%.*]] ; CHECK: loop3.preheader: -; CHECK-NEXT: [[L_1_LCSSA11:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] +; CHECK-NEXT: [[L_1_LCSSA8:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_1_LCSSA:%.*]] = phi i16* [ [[L_1]], [[INNER_BB]] ] ; CHECK-NEXT: [[L_2_LCSSA:%.*]] = phi i16* [ [[L_2]], [[INNER_BB]] ] -; CHECK-NEXT: [[L_2_LCSSA4:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8* +; CHECK-NEXT: [[L_2_LCSSA3:%.*]] = bitcast i16* [[L_2_LCSSA]] to i8* ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA4]], i64 1 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[L_2_LCSSA3]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[L_1_LCSSA]], i64 1 -; CHECK-NEXT: [[SCEVGEP9:%.*]] = bitcast i16* [[SCEVGEP]] to i8* +; CHECK-NEXT: [[SCEVGEP6:%.*]] = bitcast i16* [[SCEVGEP]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], 2 -; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr i16, i16* [[L_1_LCSSA11]], i64 [[TMP1]] -; CHECK-NEXT: [[SCEVGEP1013:%.*]] = bitcast i16* [[SCEVGEP10]] to i8* -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA4]], [[SCEVGEP1013]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP9]], [[UGLYGEP]] +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i16, i16* [[L_1_LCSSA8]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP710:%.*]] = bitcast i16* [[SCEVGEP7]] to i8* +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[L_2_LCSSA3]], [[SCEVGEP710]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP6]], [[UGLYGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT]], true ; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] @@ -66,8 +66,6 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[L_18:%.*]] = phi i16* [ [[L_1_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_1_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[L_23:%.*]] = phi i16* [ [[L_2_LCSSA]], [[VECTOR_MEMCHECK]] ], [ [[L_2_LCSSA]], [[LOOP3_PREHEADER]] ], [ [[L_2_LCSSA]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP3:%.*]] ; CHECK: inner.latch: @@ -79,20 +77,19 @@ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[C_5:%.*]] = icmp ult i64 [[IV]], [[N]] -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_18]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i16, i16* [[L_1_LCSSA]], i64 [[IV_NEXT]] ; CHECK-NEXT: [[LOOP_L_1:%.*]] = load i16, i16* [[GEP_1]], align 2 -; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_23]], i64 0 +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i16, i16* [[L_2_LCSSA]], i64 0 ; CHECK-NEXT: store i16 [[LOOP_L_1]], i16* [[GEP_2]], align 2 ; CHECK-NEXT: br i1 [[C_5]], label [[LOOP3]], label [[EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]] ; CHECK: exit.loopexit: -; CHECK-NEXT: [[L_17:%.*]] = phi i16* [ [[L_1_LCSSA]], [[MIDDLE_BLOCK]] ], [ [[L_18]], [[LOOP3]] ] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit.loopexit1: -; CHECK-NEXT: [[L_1_LCSSA5:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ] +; CHECK-NEXT: [[L_1_LCSSA4:%.*]] = phi i16* [ [[L_1]], [[INNER_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[L_16:%.*]] = phi i16* [ [[L_1_LCSSA5]], [[EXIT_LOOPEXIT1]] ], [ [[L_17]], [[EXIT_LOOPEXIT]] ] -; CHECK-NEXT: [[L_3:%.*]] = load i16, i16* [[L_16]], align 2 +; CHECK-NEXT: [[L_15:%.*]] = phi i16* [ [[L_1_LCSSA4]], [[EXIT_LOOPEXIT1]] ], [ [[L_1_LCSSA]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: [[L_3:%.*]] = load i16, i16* [[L_15]], align 2 ; CHECK-NEXT: ret i16 [[L_3]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll --- a/llvm/test/Transforms/LoopVectorize/version-mem-access.ll +++ b/llvm/test/Transforms/LoopVectorize/version-mem-access.ll @@ -54,41 +54,3 @@ for.end: ret void } - -; We used to crash on this function because we removed the fptosi cast when -; replacing the symbolic stride '%conv'. -; PR18480 - -; CHECK-LABEL: fn1 -; CHECK: load <2 x double> - -define void @fn1(double* noalias %x, double* noalias %c, double %a) { -entry: - %conv = fptosi double %a to i32 - %conv2 = add i32 %conv, 4 - %cmp8 = icmp sgt i32 %conv2, 0 - br i1 %cmp8, label %for.body.preheader, label %for.end - -for.body.preheader: - br label %for.body - -for.body: - %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] - %0 = trunc i64 %indvars.iv to i32 - %mul = mul nsw i32 %0, %conv - %idxprom = sext i32 %mul to i64 - %arrayidx = getelementptr inbounds double, double* %x, i64 %idxprom - %1 = load double, double* %arrayidx, align 8 - %arrayidx3 = getelementptr inbounds double, double* %c, i64 %indvars.iv - store double %1, double* %arrayidx3, align 8 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, %conv2 - br i1 %exitcond, label %for.end.loopexit, label %for.body - -for.end.loopexit: - br label %for.end - -for.end: - ret void -}