diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -35,6 +35,7 @@ class OptimizationRemarkEmitter; class PredIteratorCache; class ScalarEvolution; +class ScalarEvolutionExpander; class SCEV; class SCEVExpander; class TargetLibraryInfo; @@ -446,7 +447,7 @@ std::pair addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl &PointerChecks, - ScalarEvolution *SE); + SCEVExpander &Expander); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" @@ -180,6 +181,8 @@ ChainedPhis.clear(); } + ScalarEvolution *getSE() { return &SE; } + /// Return a vector containing all instructions inserted during expansion. SmallVector getAllInsertedInstructions() const { SmallVector Result; @@ -485,10 +488,12 @@ SCEVExpanderCleaner(SCEVExpander &Expander, DominatorTree &DT) : Expander(Expander), DT(DT), ResultUsed(false) {} - ~SCEVExpanderCleaner(); + ~SCEVExpanderCleaner() { cleanup(); } /// Indicate that the result of the expansion is used. void markResultUsed() { ResultUsed = true; } + + void cleanup(); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1574,7 +1574,8 @@ /// in \p TheLoop. \return the values for the bounds. static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, Loop *TheLoop, Instruction *Loc, - SCEVExpander &Exp, ScalarEvolution *SE) { + SCEVExpander &Exp) { + ScalarEvolution *SE = Exp.getSE(); // TODO: Add helper to retrieve pointers to CG. Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1613,16 +1614,15 @@ /// lower bounds for both pointers in the check. static SmallVector, 4> expandBounds(const SmallVectorImpl &PointerChecks, Loop *L, - Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) { + Instruction *Loc, SCEVExpander &Exp) { SmallVector, 4> ChecksWithBounds; // Here we're relying on the SCEV Expander's cache to only emit code for the // same bounds once. transform(PointerChecks, std::back_inserter(ChecksWithBounds), [&](const RuntimePointerCheck &Check) { - PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE), - Second = - expandBounds(Check.second, L, Loc, Exp, SE); + PointerBounds First = expandBounds(Check.first, L, Loc, Exp), + Second = expandBounds(Check.second, L, Loc, Exp); return std::make_pair(First, Second); }); @@ -1632,12 +1632,10 @@ std::pair llvm::addRuntimeChecks( Instruction *Loc, Loop *TheLoop, const SmallVectorImpl &PointerChecks, - ScalarEvolution *SE) { + SCEVExpander &Exp) { // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible. // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible - const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); - SCEVExpander Exp(*SE, DL, "induction"); - auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp); + auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp); LLVMContext &Ctx = Loc->getContext(); Instruction *FirstInst = nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -68,9 +68,12 @@ // Add the memcheck in the original preheader (this is empty initially). BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop, - AliasChecks, RtPtrChecking.getSE()); + + SCEVExpander Exp2(*RtPtrChecking.getSE(), + VersionedLoop->getHeader()->getModule()->getDataLayout(), + "induction"); + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + RuntimeCheckBB->getTerminator(), VersionedLoop, AliasChecks, Exp2); const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate(); SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2632,11 +2632,12 @@ return false; } -SCEVExpanderCleaner::~SCEVExpanderCleaner() { +void SCEVExpanderCleaner::cleanup() { // Result is used, nothing to remove. if (ResultUsed) return; + ResultUsed = true; auto InsertedInstructions = Expander.getAllInsertedInstructions(); SmallPtrSet InsertedSet(InsertedInstructions.begin(), InsertedInstructions.end()); @@ -2645,10 +2646,14 @@ Expander.clear(); // Sort so that earlier instructions do not dominate later instructions. - sort(InsertedInstructions, - [this](Instruction *A, Instruction *B) { return DT.dominates(B, A); }); + stable_sort(InsertedInstructions, [this](Instruction *A, Instruction *B) { + return DT.dominates(B, A); + }); // Remove all inserted instructions. for (Instruction *I : InsertedInstructions) { + for (Value *U : I->users()) { + assert(InsertedSet.contains(cast(U))); + } assert(all_of(I->users(), [&InsertedSet](Value *U) { return InsertedSet.contains(cast(U)); })); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -370,9 +370,8 @@ return None; } - +struct GeneratedRTChecks; namespace llvm { - /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -396,12 +395,12 @@ OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { + BFI(BFI), PSI(PSI), Check(Check) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -795,6 +794,10 @@ // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; + + /// Structure to hold information about generated runtime checks, responsible + /// for cleaning the checks, if they turn out unprofitable. + GeneratedRTChecks &Check; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -806,9 +809,9 @@ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM, BFI, PSI) {} + UnrollFactor, LVL, CM, BFI, PSI, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -1547,9 +1550,99 @@ /// Values to ignore in the cost model when VF > 1. SmallPtrSet VecValuesToIgnore; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front in a temporary block to allow better +/// estimating the cost and un-linked from the existing IR. After deciding to +/// vectorize, the checks are moved backed. If deciding not to vectorize, the +/// temporary block is completely removed. +struct GeneratedRTChecks { + BasicBlock *TmpBlock = nullptr; + BasicBlock *Preheader; + Value *SCEVCheck; + Instruction *FirstCheckInst = nullptr; + Instruction *MemRuntimeCheck = nullptr; + + ScalarEvolution &SE; + DominatorTree *DT; + + SCEVExpander Exp; + SCEVExpanderCleaner Cleaner; + + GeneratedRTChecks(BasicBlock *Preheader, ScalarEvolution &SE, + DominatorTree *DT) + : Preheader(Preheader), SE(SE), DT(DT), + Exp(SE, Preheader->getModule()->getDataLayout(), "scev.check"), + Cleaner(Exp, *DT) {} + + /// Generate runtime checks in temporary block (TmpBlock), so we can + /// accurately estimate the cost of the runtime checks. The block is un-linked + /// from the IR and is added back during vector code generation. If there is + /// no vector code generation, the check blocks is removed completely. + void Create(Loop *L, const LoopAccessInfo &LAI, + const SCEVUnionPredicate &UnionPred, LoopInfo *LI) { + BasicBlock *LoopHeader = Preheader->getSingleSuccessor(); + TmpBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "tmp.rtchecks"); + + SCEVCheck = + Exp.expandCodeForPredicate(&UnionPred, TmpBlock->getTerminator()); + + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); + if (RtPtrChecking.Need) { + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + TmpBlock->getTerminator(), L, RtPtrChecking.getChecks(), Exp); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + } + + // Unhook the temporary block with the checks, update various places + // accordingly. + TmpBlock->replaceAllUsesWith(Preheader); + TmpBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + Preheader->getTerminator()->eraseFromParent(); + DT->changeImmediateDominator(LoopHeader, Preheader); + DT->eraseNode(TmpBlock); + LI->removeBlock(TmpBlock); + } + + ~GeneratedRTChecks() { + if (!TmpBlock) { + Cleaner.markResultUsed(); + return; + } + + if (!SCEVCheck && TmpBlock->empty()) { + Cleaner.markResultUsed(); + TmpBlock->eraseFromParent(); + return; + } + + if (MemRuntimeCheck && !isa(MemRuntimeCheck)) + MemRuntimeCheck->replaceAllUsesWith( + ConstantInt::getFalse(MemRuntimeCheck->getType()->getContext())); + if (SCEVCheck && !isa(SCEVCheck)) + SCEVCheck->replaceAllUsesWith( + ConstantInt::getFalse(SCEVCheck->getType()->getContext())); + + SmallPtrSet Removed; + // Completely remove the block. + for (auto &I : make_early_inc_range(reverse(*TmpBlock))) { + if (Exp.isInsertedInstruction(&I)) + continue; + SE.forgetValue(&I); + SE.eraseValueFromMap(&I); + Removed.insert(&I); + I.eraseFromParent(); + } + + Cleaner.cleanup(); + TmpBlock->eraseFromParent(); + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2757,17 +2850,11 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; - - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); + BasicBlock *const SCEVCheckBlock = Check.TmpBlock; - if (auto *C = dyn_cast(SCEVCheck)) + if (!Check.TmpBlock) + return; + if (auto *C = dyn_cast(Check.SCEVCheck)) if (C->isZero()) return; @@ -2776,10 +2863,30 @@ "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); + Check.TmpBlock = SplitBlock(SCEVCheckBlock, + cast(Check.SCEVCheck)->getNextNode(), + nullptr, nullptr, nullptr, ""); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + Check.TmpBlock->replaceAllUsesWith(SCEVCheckBlock); + Check.TmpBlock->getTerminator()->moveBefore(SCEVCheckBlock->getTerminator()); + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { @@ -2789,9 +2896,10 @@ ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + Check.SCEVCheck = nullptr; } void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { @@ -2801,22 +2909,13 @@ // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = Check.TmpBlock; - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - auto *LAI = Legal->getLAI(); - const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); - if (!RtPtrChecking.Need) + // Check if we generated code that checks in runtime if arrays overlap. We put + // the checks into a separate block to make the more common case of few + // elements faster. + if (!Check.MemRuntimeCheck) return; - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && @@ -2832,21 +2931,33 @@ }); } + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, Check.TmpBlock); + auto *BI = BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); + BI->setDebugLoc(PHTerm->getDebugLoc()); + + DT->addNewBlock(Check.TmpBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + Check.TmpBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(Check.TmpBlock, *LI); + MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } + Check.TmpBlock = nullptr; ReplaceInstWithInst( MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.MemRuntimeCheck)); LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; @@ -7786,8 +7897,9 @@ LVP.setBestPlan(VF.Width, 1); + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); + &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -7795,7 +7907,6 @@ // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -7952,6 +8063,11 @@ IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), LI); + // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; @@ -8052,7 +8168,7 @@ // If we decided that it is not legal to vectorize the loop, then // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); + BFI, PSI, Checks); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -8064,7 +8180,7 @@ } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); + &LVL, &CM, BFI, PSI, Checks); LVP.executePlan(LB, DT); ++LoopsVectorized; @@ -8097,7 +8213,6 @@ Hints.setAlreadyVectorized(); } - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -8163,6 +8278,8 @@ Changed |= CFGChanged |= processLoop(L); } + assert(!Changed || !verifyFunction(F, &dbgs())); + // Process each loop nest in the function. return LoopVectorizeResult(Changed, CFGChanged); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -47,11 +47,10 @@ ; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[CALL]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[G_0]], [[CONV]] -; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], [[TMP0]] -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP16]] +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], [[TMP0]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP15]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP3]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP2]], [[SCEVGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] @@ -65,23 +64,23 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]] -; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[CONV]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1, !alias.scope !0 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8* [[TMP25]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP26]], align 1, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -96,13 +95,13 @@ ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP29]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP28]] ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i8 [[TMP30]], i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: store i8 [[TMP29]], i8* [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7 diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -26,9 +26,9 @@ ; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] ; CHECK: for.body3.lr.ph.us.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] ; CHECK: for.end.us: ; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] @@ -54,12 +54,12 @@ ; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3 ; CHECK: for.body3.lr.ph.us: ; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]]) @@ -74,8 +74,8 @@ ; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -100,7 +100,7 @@ ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -0,0 +1,32 @@ +; RUN: opt -loop-vectorize -force-vector-width=4 %s | FileCheck %s + +%struct.foo = type { [400 x double] } + +; Make sure we do not crash when dropping runtime checks. + +; CHECK-NOT: vector.body + +define void @barney(%struct.foo* %ptr) { +entry: + br label %loop + +loop: + %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] + %tmp4 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 undef + %tmp5 = bitcast %struct.foo* %tmp4 to i64* + store i64 0, i64* %tmp5, align 8 + %tmp8 = add i64 1, %tmp3 + %tmp10 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp8 + %tmp11 = bitcast %struct.foo* %tmp10 to i64* + store i64 1, i64* %tmp11, align 8 + %tmp14 = add i64 undef, %tmp3 + %tmp16 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp14 + %tmp17 = bitcast %struct.foo* %tmp16 to i64* + store i64 2, i64* %tmp17, align 8 + %tmp18 = add nuw nsw i64 %tmp3, 4 + %c = icmp ult i64 %tmp18, 400 + br i1 %c, label %exit, label %loop + +exit: + ret void +}