diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -90,7 +90,7 @@
                     OptimizationRemarkEmitter &ORE);

   /// Mark the loop L as already vectorized by setting the width to 1.
-  void setAlreadyVectorized();
+  void setAlreadyVectorized(const Loop *TheLoop = nullptr);

   bool allowVectorization(Function *F, Loop *L,
                           bool VectorizeOnlyWhenForced) const;

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -96,7 +96,10 @@
              << "LV: Interleaving disabled by the pass manager\n");
 }

-void LoopVectorizeHints::setAlreadyVectorized() {
+void LoopVectorizeHints::setAlreadyVectorized(const Loop *TheLoop) {
+
+  if (!TheLoop)
+    TheLoop = this->TheLoop;
   LLVMContext &Context = TheLoop->getHeader()->getContext();
   MDNode *IsVectorizedMD = MDNode::get(

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,9 @@
   bool operator==(const VectorizationFactor &rhs) const {
     return Width == rhs.Width && Cost == rhs.Cost;
   }
+
+  VectorizationFactor(ElementCount Width, unsigned Cost)
+      : Width(Width), Cost(Cost) {}
 };

 /// Planner drives the vectorization process after having passed

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -335,6 +335,10 @@
     "vectorize-loops", cl::init(true), cl::Hidden,
     cl::desc("Run the Loop vectorization passes"));

+static cl::opt<bool> EnableEpilogLoopVectorization(
+    "vectorize-remainder-loops", cl::init(true), cl::Hidden,
+    cl::desc("Enable epilog / remainder loop vectorization"));
+
 /// A helper function that returns the type of loaded or stored value.
 static Type *getMemInstValueType(Value *I) {
   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&

@@ -414,6 +418,32 @@
 namespace llvm {

+/// EpilogVectorLoopHelper holds information about the original vector
+/// loop.
+struct EpilogVectorLoopHelper {
+
+  // Vector loop runtime check blocks.
+  BasicBlock *MemRuntimeCheckBlock;
+  BasicBlock *SCEVCheckBlock;
+
+  // Original scalar block preheader.
+  BasicBlock *OriginalScalarBlock;
+
+  // Vector loop middle block.
+  BasicBlock *LoopMiddleBlock;
+
+  // Vector loop iteration check block.
+  BasicBlock *OrigTCCheckBlock;
+
+  // Alias check block for the epilog loop.
+  BasicBlock *EpilogAliasCheckBlock;
+
+  // Resume value from the original vectorized loop.
+  Value *ResumeValue;
+
+  bool hasRuntimeChecks() { return MemRuntimeCheckBlock || SCEVCheckBlock; }
+};
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
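+/// When constructed with IsEpilogLoop set, the same machinery is reused to
+/// emit a second, narrower vector loop for the remainder iterations, wired up
+/// through the blocks recorded in EpilogVectorLoopHelper by the main vector
+/// loop.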
 /// This class performs the widening of scalars into vectors, or multiple
@@ -437,12 +467,13 @@
                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
-                      ProfileSummaryInfo *PSI)
+                      ProfileSummaryInfo *PSI, EpilogVectorLoopHelper EVLH,
+                      bool IsEpilogLoop = false)
       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
         Builder(PSE.getSE()->getContext()),
         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
-        BFI(BFI), PSI(PSI) {
+        BFI(BFI), PSI(PSI), EVLH(EVLH), IsEpilogLoop(IsEpilogLoop) {
     // Query this against the original loop and save it here because the profile
     // of the original loop header may change as the transformation happens.
     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -473,6 +504,8 @@
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop();

+  Loop *getVectorizedLoop(void);
+
   // Return true if any runtime check is added.
   bool areSafetyChecksAdded() { return AddedSafetyChecks; }

@@ -561,6 +594,10 @@
   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
   void fixNonInductionPHIs(void);

+  EpilogVectorLoopHelper getEpilogVectorLoopHelper() { return EVLH; }
+
+  bool canCreateVectorEpilog();
+
 protected:
   friend class LoopVectorizationPlanner;

@@ -687,7 +724,7 @@
   /// Emit a bypass check to see if the vector trip count is zero, including if
   /// it overflows.
-  void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+  BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);

   /// Emit a bypass check to see if all of the SCEV assumptions we've
   /// had to make are correct.
@@ -855,6 +892,10 @@
   // Whether this loop should be optimized for size based on profile guided size
   // optimizations.
   bool OptForSizeBasedOnProfile;
+
+  // Epilog loop helpers.
+  EpilogVectorLoopHelper EVLH;
+  bool IsEpilogLoop;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -869,7 +910,7 @@
                     ProfileSummaryInfo *PSI)
       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
-                            BFI, PSI) {}
+                            BFI, PSI, EpilogVectorLoopHelper()) {}

 private:
   Value *getBroadcastInstrs(Value *V) override;
@@ -1070,6 +1111,7 @@
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+  VectorizationFactor selectEpilogVectorizationFactor(unsigned MaxVF);

   /// Setup cost-based decisions for user vectorization factor.
   void selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1644,6 +1686,9 @@
   /// Values to ignore in the cost model when VF > 1.
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+  /// Profitable vector factors.
+  SmallVector<VectorizationFactor, 8> ProfitableVFs;
 };

 } // end namespace llvm

@@ -2836,14 +2881,24 @@
   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
 }

-void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
-                                                         BasicBlock *Bypass) {
+BasicBlock *
+InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+                                                    BasicBlock *Bypass) {
   Value *Count = getOrCreateTripCount(L);
   // Reuse existing vector loop preheader for TC checks.
   // Note that new preheader block is generated for vector loop.
   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
   IRBuilder<> Builder(TCCheckBlock->getTerminator());

+  // Epilog loop: subtract the number of iterations already executed by the
+  // original vector loop.
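+  // For example (mirroring the masked_load_store.ll test below): with a trip
+  // count of 10000 and a main vector loop stepping by VF * UF = 64, the main
+  // loop covers 9984 iterations and resumes at 9984, so the remaining count
+  // here becomes 10000 - 9984 = 16, which an epilog loop with VF = 8 can
+  // finish without running any scalar iterations.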
+  if (IsEpilogLoop) {
+    Value *Temp = Builder.CreateSub(
+        EVLH.ResumeValue,
+        Legal->getInductionVars().front().second.getStartValue());
+    Count = Builder.CreateSub(Count, Temp, "remainder.iter");
+  }
+
   // Generate code to check if the loop's trip count is less than VF * UF, or
   // equal to it in case a scalar epilogue is required; this implies that the
   // vector trip count is zero. This check also covers the case where adding one
@@ -2864,7 +2919,7 @@
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
-                 "vector.ph");
+                 IsEpilogLoop ? "epilog.vector.ph" : "vector.ph");

   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                DT->getNode(Bypass)->getIDom()) &&
@@ -2878,6 +2933,7 @@
       TCCheckBlock->getTerminator(),
       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
   LoopBypassBlocks.push_back(TCCheckBlock);
+  return TCCheckBlock;
 }

 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
@@ -2902,11 +2958,12 @@
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
          "Cannot SCEV check stride or overflow when optimizing for size");

-  SCEVCheckBlock->setName("vector.scevcheck");
+  SCEVCheckBlock->setName(IsEpilogLoop ? "epilog.vector.scevcheck"
+                                       : "vector.scevcheck");
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
-                 nullptr, "vector.ph");
+                 nullptr, IsEpilogLoop ? "epilog.vector.ph" : "vector.ph");

   // Update dominator only if this is first RT check.
   if (LoopBypassBlocks.empty()) {
@@ -2917,6 +2974,8 @@
   ReplaceInstWithInst(
       SCEVCheckBlock->getTerminator(),
       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+  if (!IsEpilogLoop)
+    EVLH.SCEVCheckBlock = SCEVCheckBlock;
   LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 }
@@ -2952,16 +3011,19 @@
     });
   }

-  MemCheckBlock->setName("vector.memcheck");
+  MemCheckBlock->setName(IsEpilogLoop ? "epilog.vector.memcheck"
+                                      : "vector.memcheck");
   // Create new preheader for vector loop.
   LoopVectorPreHeader =
       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
-                 "vector.ph");
+                 IsEpilogLoop ? "epilog.vector.ph" : "vector.ph");

   auto *CondBranch = cast<BranchInst>(
       Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
   ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
   LoopBypassBlocks.push_back(MemCheckBlock);
+  if (!IsEpilogLoop)
+    EVLH.MemRuntimeCheckBlock = MemCheckBlock;
   AddedSafetyChecks = true;

   // Update dominator only if this is first RT check.
@@ -3144,6 +3206,15 @@
                                      LoopScalarPreHeader->getTerminator());
     // Copy original phi DL over to the new one.
     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+
+    if (!IsEpilogLoop) {
+      EVLH.ResumeValue = BCResumeVal;
+      EVLH.OriginalScalarBlock = LoopScalarPreHeader;
+    } else {
+      EVLH.OriginalScalarBlock->setName("epilog.ph");
+      LoopScalarPreHeader->setName("scalar.ph");
+    }
+
     Value *&EndValue = IVEndValues[OrigPhi];
     if (OrigPhi == OldInduction) {
       // We know what the end value is.
@@ -3166,9 +3237,40 @@
     // Fix the scalar body counter (PHI node).
     // The old induction's phi node in the scalar body needs the truncated
     // value.
-    for (BasicBlock *BB : LoopBypassBlocks)
-      BCResumeVal->addIncoming(II.getStartValue(), BB);
+    for (BasicBlock *BB : LoopBypassBlocks) {
+      Value *StartValue = II.getStartValue();
+      if (IsEpilogLoop) {
+        StartValue = EVLH.ResumeValue;
+      }
+      BCResumeVal->addIncoming(StartValue, BB);
+    }
+    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+
+    // Fix the CFG when the epilog vector loop is generated and there are
+    // runtime check blocks.
+    if (IsEpilogLoop && (EVLH.hasRuntimeChecks())) {
+      for (BasicBlock *BB : {EVLH.MemRuntimeCheckBlock, EVLH.SCEVCheckBlock}) {
+        if (!BB)
+          continue;
+        BB->getTerminator()->replaceSuccessorWith(EVLH.OriginalScalarBlock,
+                                                  LoopScalarPreHeader);
+        BCResumeVal->addIncoming(II.getStartValue(), BB);
+        cast<PHINode>(EVLH.ResumeValue)->removeIncomingValue(BB);
+      }
+      BasicBlock *Entry =
+          DT->getNode(EVLH.SCEVCheckBlock ? EVLH.SCEVCheckBlock
+                                          : EVLH.MemRuntimeCheckBlock)
+              ->getIDom()
+              ->getBlock();
+      DT->changeImmediateDominator(LoopScalarPreHeader, Entry);
+      DT->changeImmediateDominator(LoopExitBlock, Entry);
+      BranchInst *BI =
+          cast<BranchInst>(EVLH.EpilogAliasCheckBlock->getTerminator());
+      BI->setSuccessor(1, LoopVectorPreHeader);
+      DT->changeImmediateDominator(LoopVectorPreHeader,
+                                   EVLH.EpilogAliasCheckBlock);
+    }
   }
 }

@@ -3276,20 +3378,92 @@
   ...
   */

+  /*
+  Simplified version of the epilog-vectorized loop CFG:
+
+  1. The alias and SCEV checks are executed only once, either as part of the
+     original vector loop or of the epilog vector loop, but the check code is
+     cloned in both places.
+
+
+      [ ]  <-- original loop iteration check.
+     /  |
+    /   v
+   |   [ ]  <-- original vector loop bypass (may consist of multiple blocks).
+   |  /  |
+   | /   v
+   ||   [ ]  <-- original vector pre-header.
+   |/    |
+   |     v
+   |    [ ] \
+   |    [ ]_|  <-- original vector loop.
+   |     |
+   |     v
+   |   -[ ]  <--- original middle-block.
+   |  /  |
+   | /   v
+   |----[ ]  <-- epilog vector loop iteration number check.
+   |   / |
+   |/    v
+   /    [ ]  <-- previously executed alias result check or new checks.
+   ||  / |
+   || /  v
+   |/   [ ]  <-- new vector version pre-header.
+   ||    |
+   ||    v
+   ||   [ ] \
+   ||   [ ]_|  <-- new vector loop.
+   ||    |
+   ||    v
+   ||  -[ ]  <--- new middle-block.
+   || /  |
+   | /   v
+   -|-  >[ ]  <--- new scalar pre-header.
+   |     |
+   |     v
+   |    [ ] \
+   |    [ ]_|  <-- old scalar loop to handle remainder.
+    \    |
+     \   v
+     >[ ]  <-- exit block.
+  */
+
   // Get the metadata of the original loop before it gets modified.
   MDNode *OrigLoopID = OrigLoop->getLoopID();

   // Create an empty vector loop, and prepare basic blocks for the runtime
   // checks.
-  Loop *Lp = createVectorLoopSkeleton("");
+  Loop *Lp = createVectorLoopSkeleton(IsEpilogLoop ? "epilog." : "");

   // Now, compare the new count to zero. If it is zero skip the vector loop and
   // jump to the scalar loop. This check also covers the case where the
   // backedge-taken count is uint##_max: adding one to it will overflow leading
   // to an incorrect trip count of zero. In this (rare) case we will also jump
   // to the scalar loop.
-  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
-
+  BasicBlock *TCCheckBlock =
+      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
+
+  // Create a bypass for the epilog vector loop's runtime checks if they were
+  // already done in the original vector loop.
+  if (IsEpilogLoop) {
+    IRBuilder<> Builder(&TCCheckBlock->front());
+    PHINode *RuntimeChecksPHI = Builder.CreatePHI(Builder.getInt1Ty(), 2);
+    RuntimeChecksPHI->addIncoming(Builder.getInt1(false), EVLH.LoopMiddleBlock);
+    RuntimeChecksPHI->addIncoming(Builder.getInt1(true), EVLH.OrigTCCheckBlock);
+
+    BasicBlock *TCBlock = LoopVectorPreHeader;
+    // Create new preheader for vector loop.
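+    // RuntimeChecksPHI (created above in the epilog's trip-count check block)
+    // is true when control arrives straight from the original trip-count
+    // check, i.e. the main vector loop and its runtime checks never ran, and
+    // false when it arrives from the middle block, where those checks have
+    // already passed; the branch created below uses it to either re-run the
+    // cloned checks or jump directly to the epilog vector preheader.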
+    TCBlock->setName("runtime.check.ph");
+    LoopVectorPreHeader = SplitBlock(TCBlock, TCBlock->getTerminator(), DT, LI,
+                                     nullptr, "runtime.check.ph");
+
+    ReplaceInstWithInst(TCBlock->getTerminator(),
+                        BranchInst::Create(LoopVectorPreHeader,
+                                           LoopVectorPreHeader,
+                                           RuntimeChecksPHI));
+    EVLH.EpilogAliasCheckBlock = TCBlock;
+  } else {
+    EVLH.OrigTCCheckBlock = TCCheckBlock;
+  }

   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
   emitSCEVChecks(Lp, LoopScalarPreHeader);
@@ -3313,6 +3487,15 @@
   OldInduction = Legal->getPrimaryInduction();
   Type *IdxTy = Legal->getWidestInductionType();
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
+  if (IsEpilogLoop) {
+    IRBuilder<> Builder(cast<Instruction>(EVLH.ResumeValue)->getNextNode());
+    StartIdx = Builder.CreateSub(
+        EVLH.ResumeValue,
+        Legal->getInductionVars().front().second.getStartValue());
+  } else {
+    EVLH.LoopMiddleBlock = LoopMiddleBlock;
+  }
+
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
   assert(!VF.isScalable() && "scalable vectors not yet supported.");
@@ -3652,6 +3835,10 @@
   }
 }

+Loop *InnerLoopVectorizer::getVectorizedLoop() {
+  return LI->getLoopFor(LoopVectorBody);
+}
+
 void InnerLoopVectorizer::fixVectorizedLoop() {
   // Insert truncates and extends for any truncated instructions as hints to
   // InstCombine.
@@ -5419,6 +5606,13 @@
                  << " because it will not generate any vector instructions.\n");
       continue;
     }
+
+    // If profitable, add it to the ProfitableVFs list.
+    if (VectorCost < ScalarCost) {
+      ProfitableVFs.push_back(
+          VectorizationFactor(ElementCount::getFixed(i), VectorCost));
+    }
+
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
@@ -5442,6 +5636,25 @@
   return Factor;
 }

+VectorizationFactor
+LoopVectorizationCostModel::selectEpilogVectorizationFactor(unsigned MaxVF) {
+  // Find the next VF.
+  unsigned VF = 1;
+  unsigned int Cost = 0;
+  // The original loop's vector factor should be at least 16.
+  if (MaxVF < 16) {
+    return VectorizationFactor(ElementCount::getFixed(VF), Cost);
+  }
+  for (auto &NextVF : ProfitableVFs) {
+    if ((NextVF.Width.getFixedValue() < MaxVF) &&
+        (VF == 1 || NextVF.Cost < Cost)) {
+      VF = NextVF.Width.getFixedValue();
+      Cost = NextVF.Cost;
+    }
+  }
+  return VectorizationFactor(ElementCount::getFixed(VF), Cost);
+}
+
 std::pair<unsigned, unsigned>
 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   unsigned MinWidth = -1U;
@@ -8217,7 +8430,7 @@
   LVP.setBestPlan(VF.Width, 1);
   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                         &CM, BFI, PSI);
+                         &CM, BFI, PSI, EpilogVectorLoopHelper());
   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                     << L->getHeader()->getParent()->getName() << "\"\n");
   LVP.executePlan(LB, DT);
@@ -8235,6 +8448,43 @@
       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                               !EnableLoopVectorization) {}

+bool InnerLoopVectorizer::canCreateVectorEpilog() {
+
+  // If optimizing for size, do not do it.
+  if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
+      OptForSizeBasedOnProfile) {
+    return false;
+  }
+  // The epilog vectorization option should be enabled.
+  if (!EnableEpilogLoopVectorization)
+    return false;
+  // The loop should have a single exit block.
+  if (!OrigLoop->getExitBlock())
+    return false;
+  // The loop should have a preheader.
+  if (!OrigLoop->getLoopPreheader())
+    return false;
+  // The loop bypass blocks should not be empty.
+  if (!LoopBypassBlocks.size())
+    return false;
+  // FIXME: Yet to handle loops with multiple induction variables.
+  if (Legal->getInductionVars().size() != 1) {
+    return false;
+  }
+  // FIXME: Yet to handle reductions / first-order recurrences.
+  if (!Legal->getReductionVars().empty() ||
+      !Legal->getFirstOrderRecurrences().empty()) {
+    return false;
+  }
+  // FIXME: Yet to handle loops with outgoing values.
+  if (!LoopExitBlock->phis().empty())
+    return false;
+  // Illegal for now if types are extended / truncated.
+  if (getOrCreateTripCount(OrigLoop)->getType() != EVLH.ResumeValue->getType())
+    return false;
+  return true;
+}
+
 bool LoopVectorizePass::processLoop(Loop *L) {
   assert((EnableVPlanNativePath || L->isInnermost()) &&
          "VPlan-native path is not enabled. Only process inner loops.");
@@ -8495,7 +8745,7 @@
     } else {
       // If we decided that it is *legal* to vectorize the loop, then do it.
       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                             &LVL, &CM, BFI, PSI);
+                             &LVL, &CM, BFI, PSI, EpilogVectorLoopHelper());
       LVP.executePlan(LB, DT);
       ++LoopsVectorized;

@@ -8513,6 +8763,32 @@
                  << NV("VectorizationFactor", VF.Width)
                  << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
       });
+
+      if (LB.canCreateVectorEpilog()) {
+        LLVM_DEBUG(dbgs() << "LV: Trying for Epilog Vectorization\n");
+        VectorizationFactor EpiVF =
+            CM.selectEpilogVectorizationFactor(VF.Width.getFixedValue());
+        // Enter only if VF > 1.
+        if (EpiVF.Width.isVector()) {
+          LLVM_DEBUG(dbgs() << "LV: Epilog Vectorization with width : "
+                            << EpiVF.Width << " in " << DebugLocStr << '\n');
+          // Get the simplified form again.
+          simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+          InnerLoopVectorizer EpilogVectorizer(
+              L, PSE, LI, DT, TLI, TTI, AC, ORE, EpiVF.Width, 1, &LVL, &CM, BFI,
+              PSI, LB.getEpilogVectorLoopHelper(), true);
+          // There is no benefit in unrolling the epilog loop.
+          LVP.setBestPlan(EpiVF.Width, /*UF=*/1);
+          LVP.executePlan(EpilogVectorizer, DT);
+          // Also disable runtime unrolling of the epilog vector loop.
+          AddRuntimeUnrollDisableMetaData(EpilogVectorizer.getVectorizedLoop());
+          // Mark it as vectorized.
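+          // Passing the newly created epilog vector loop explicitly attaches
+          // the width-1 hint to that loop, so the vectorizer will not try to
+          // vectorize the remainder again.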
+ Hints.setAlreadyVectorized(EpilogVectorizer.getVectorizedLoop()); + } else { + LLVM_DEBUG(dbgs() << "LV: Epilog Vectorization is not profitable " + << " in " << DebugLocStr << '\n'); + } + } } Optional RemainderLoopID = diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -54,7 +54,7 @@ ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[TMP12]], [[BIN_RDX]] @@ -75,7 +75,7 @@ ; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP7:!llvm.loop !.*]] ; CHECK: for.end: ; CHECK-NEXT: [[T4:%.*]] = phi i32 [ [[T3]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[T4]] @@ -105,14 +105,14 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[EPILOG_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] @@ -120,7 +120,7 @@ ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0 @@ -141,15 +141,55 @@ ; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[BROADCAST_SPLAT6]], <16 x i32*> [[BROADCAST_SPLAT8]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !11 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: 
[[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !13 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[EPILOG_PH]] +; CHECK: epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX9:%.*]] = select i1 [[TMP7]], i64 [[N]], i64 1 +; CHECK-NEXT: [[REMAINDER_ITER:%.*]] = sub nsw i64 [[SMAX9]], [[BC_RESUME_VAL]] +; CHECK-NEXT: [[MIN_ITERS_CHECK10:%.*]] = icmp ult i64 [[REMAINDER_ITER]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK10]], label [[SCALAR_PH]], label [[RUNTIME_CHECK_PH:%.*]] +; CHECK: runtime.check.ph: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[EPILOG_VECTOR_MEMCHECK:%.*]], label [[EPILOG_VECTOR_PH:%.*]] +; CHECK: epilog.vector.memcheck: +; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]] +; CHECK-NEXT: [[UGLYGEP14:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[BOUND016:%.*]] = icmp ugt i8* [[UGLYGEP14]], [[B1]] +; CHECK-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP12]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT18]], label [[SCALAR_PH]], label [[EPILOG_VECTOR_PH]] +; CHECK: epilog.vector.ph: +; CHECK-NEXT: [[N_VEC21:%.*]] = and i64 [[SMAX9]], 9223372036854775800 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i32> undef, i32 [[K]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT27]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <8 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT29]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <8 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT31]], <8 x i32*> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[EPILOG_VECTOR_BODY:%.*]] +; CHECK: epilog.vector.body: +; CHECK-NEXT: [[INDEX22:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_PH]] ], [ [[INDEX_NEXT23:%.*]], [[EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX22]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 8, !alias.scope !14, !noalias !17 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD26]], [[BROADCAST_SPLAT28]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT30]], <8 x i32>* [[TMP11]], align 4, !alias.scope !14, !noalias !17 +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT30]], <8 x i32*> [[BROADCAST_SPLAT32]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !17 +; CHECK-NEXT: [[INDEX_NEXT23]] = add i64 [[INDEX22]], 8 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC21]] +; CHECK-NEXT: br i1 [[TMP12]], label 
[[EPILOG_MIDDLE_BLOCK:%.*]], label [[EPILOG_VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] +; CHECK: epilog.middle.block: +; CHECK-NEXT: [[CMP_N25:%.*]] = icmp eq i64 [[SMAX9]], [[N_VEC21]] +; CHECK-NEXT: br i1 [[CMP_N25]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL24:%.*]] = phi i64 [ [[N_VEC21]], [[EPILOG_MIDDLE_BLOCK]] ], [ [[BC_RESUME_VAL]], [[EPILOG_PH]] ], [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL24]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] ; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]] @@ -161,7 +201,9 @@ ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !14 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], [[LOOP21:!llvm.loop !.*]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -193,15 +235,15 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) { ; CHECK-LABEL: @variant_val_store_to_inv_address_conditional( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8* ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[EPILOG_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] @@ -218,7 +260,7 @@ ; CHECK-NEXT: [[BOUND113:%.*]] = icmp ugt i8* [[UGLYGEP]], [[C5]] ; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]] ; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[K:%.*]], i32 0 @@ -232,25 +274,77 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 
[[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !15, !noalias !18 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 8, !alias.scope !22, !noalias !25 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <16 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* -; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT17]], <16 x i32>* [[TMP5]], align 4, !alias.scope !15, !noalias !18 +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT17]], <16 x i32>* [[TMP5]], align 4, !alias.scope !22, !noalias !25 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !21 -; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT19]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !22, !noalias !21 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP7]], i32 8, <16 x i1> [[TMP4]], <16 x i32> undef), !alias.scope !28 +; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> [[WIDE_MASKED_LOAD]], <16 x i32*> [[BROADCAST_SPLAT19]], i32 4, <16 x i1> [[TMP4]]), !alias.scope !29, !noalias !28 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !23 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[EPILOG_PH]] +; CHECK: epilog.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX20:%.*]] = select i1 [[TMP9]], i64 [[N]], i64 1 +; CHECK-NEXT: [[REMAINDER_ITER:%.*]] = sub nsw i64 [[SMAX20]], [[BC_RESUME_VAL]] +; CHECK-NEXT: [[MIN_ITERS_CHECK21:%.*]] = icmp ult i64 [[REMAINDER_ITER]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK21]], label [[SCALAR_PH]], label [[RUNTIME_CHECK_PH:%.*]] +; CHECK: runtime.check.ph: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[EPILOG_VECTOR_MEMCHECK:%.*]], label [[EPILOG_VECTOR_PH:%.*]] +; CHECK: epilog.vector.memcheck: +; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX]] +; CHECK-NEXT: [[UGLYGEP25:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[SCEVGEP26:%.*]] = getelementptr i32, i32* [[C]], i64 [[SMAX]] +; CHECK-NEXT: [[BOUND029:%.*]] = icmp ugt i8* [[UGLYGEP25]], [[B1]] +; CHECK-NEXT: [[BOUND130:%.*]] = icmp ugt i32* [[SCEVGEP23]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT31:%.*]] = and i1 [[BOUND029]], [[BOUND130]] +; CHECK-NEXT: [[BOUND032:%.*]] = icmp ugt i32* [[SCEVGEP26]], [[B]] +; CHECK-NEXT: [[BOUND133:%.*]] = icmp ugt i32* [[SCEVGEP23]], [[C]] +; CHECK-NEXT: [[FOUND_CONFLICT34:%.*]] = and i1 [[BOUND032]], [[BOUND133]] +; CHECK-NEXT: [[CONFLICT_RDX35:%.*]] = or i1 [[FOUND_CONFLICT31]], [[FOUND_CONFLICT34]] +; 
CHECK-NEXT: [[BOUND037:%.*]] = icmp ugt i32* [[SCEVGEP26]], [[A]] +; CHECK-NEXT: [[BOUND138:%.*]] = icmp ugt i8* [[UGLYGEP25]], [[C5]] +; CHECK-NEXT: [[FOUND_CONFLICT39:%.*]] = and i1 [[BOUND037]], [[BOUND138]] +; CHECK-NEXT: [[CONFLICT_RDX40:%.*]] = or i1 [[CONFLICT_RDX35]], [[FOUND_CONFLICT39]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX40]], label [[SCALAR_PH]], label [[EPILOG_VECTOR_PH]] +; CHECK: epilog.vector.ph: +; CHECK-NEXT: [[N_VEC43:%.*]] = and i64 [[SMAX20]], 9223372036854775800 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT49:%.*]] = insertelement <8 x i32> undef, i32 [[K]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT50:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT49]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT51:%.*]] = insertelement <8 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT52:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT51]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT54:%.*]] = insertelement <8 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT55:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT54]], <8 x i32*> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[EPILOG_VECTOR_BODY:%.*]] +; CHECK: epilog.vector.body: +; CHECK-NEXT: [[INDEX44:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_PH]] ], [ [[INDEX_NEXT45:%.*]], [[EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX44]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD48:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 8, !alias.scope !31, !noalias !34 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD48]], [[BROADCAST_SPLAT50]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT52]], <8 x i32>* [[TMP13]], align 4, !alias.scope !31, !noalias !34 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX44]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD53:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP15]], i32 8, <8 x i1> [[TMP12]], <8 x i32> undef), !alias.scope !37 +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD53]], <8 x i32*> [[BROADCAST_SPLAT55]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !38, !noalias !37 +; CHECK-NEXT: [[INDEX_NEXT45]] = add i64 [[INDEX44]], 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT45]], [[N_VEC43]] +; CHECK-NEXT: br i1 [[TMP16]], label [[EPILOG_MIDDLE_BLOCK:%.*]], label [[EPILOG_VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] +; CHECK: epilog.middle.block: +; CHECK-NEXT: [[CMP_N47:%.*]] = icmp eq i64 [[SMAX20]], [[N_VEC43]] +; CHECK-NEXT: br i1 [[CMP_N47]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL46:%.*]] = phi i64 [ [[N_VEC43]], [[EPILOG_MIDDLE_BLOCK]] ], [ [[BC_RESUME_VAL]], [[EPILOG_PH]] ], [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL46]], [[SCALAR_PH]] ] ; 
CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] ; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[T2]], [[K]] @@ -264,7 +358,9 @@ ; CHECK: latch: ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !24 +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], [[LOOP40:!llvm.loop !.*]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/test/Transforms/LoopVectorize/X86/masked_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -61,7 +61,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !5, !noalias !7 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -84,7 +84,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -176,7 +176,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -199,7 +199,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -208,7 +208,7 @@ ; AVX512-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* -; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX512-NEXT: br i1 false, label [[EPILOG_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 10000 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8* @@ -224,7 +224,7 @@ ; AVX512-NEXT: 
[[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] ; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true -; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: @@ -291,30 +291,83 @@ ; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !5, !noalias !7 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[EPILOG_PH]] +; AVX512: epilog.ph: +; AVX512-NEXT: [[TMP49:%.*]] = phi i1 [ false, [[MIDDLE_BLOCK]] ], [ true, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; AVX512-NEXT: [[TMP50:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; AVX512-NEXT: [[TMP51:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; AVX512-NEXT: [[REMAINDER_ITER:%.*]] = sub i64 10000, [[TMP51]] +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[REMAINDER_ITER]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[RUNTIME_CHECK_PH:%.*]] +; AVX512: runtime.check.ph: +; AVX512-NEXT: br i1 [[TMP49]], label [[EPILOG_VECTOR_MEMCHECK:%.*]], label [[EPILOG_VECTOR_PH:%.*]] +; AVX512: epilog.vector.memcheck: +; AVX512-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[A]], i64 10000 +; AVX512-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8* +; AVX512-NEXT: [[SCEVGEP21:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 +; AVX512-NEXT: [[SCEVGEP2122:%.*]] = bitcast i32* [[SCEVGEP21]] to i8* +; AVX512-NEXT: [[SCEVGEP23:%.*]] = getelementptr i32, i32* [[B]], i64 10000 +; AVX512-NEXT: [[SCEVGEP2324:%.*]] = bitcast i32* [[SCEVGEP23]] to i8* +; AVX512-NEXT: [[BOUND025:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP2122]] +; AVX512-NEXT: [[BOUND126:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP1920]] +; AVX512-NEXT: [[FOUND_CONFLICT27:%.*]] = and i1 [[BOUND025]], [[BOUND126]] +; AVX512-NEXT: [[BOUND028:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP2324]] +; AVX512-NEXT: [[BOUND129:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP1920]] +; AVX512-NEXT: [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]] +; AVX512-NEXT: [[CONFLICT_RDX31:%.*]] = or i1 [[FOUND_CONFLICT27]], [[FOUND_CONFLICT30]] +; AVX512-NEXT: [[MEMCHECK_CONFLICT32:%.*]] = and i1 [[CONFLICT_RDX31]], true +; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT32]], label [[SCALAR_PH]], label [[EPILOG_VECTOR_PH]] +; AVX512: epilog.vector.ph: +; AVX512-NEXT: br label [[EPILOG_VECTOR_BODY:%.*]] +; AVX512: epilog.vector.body: +; AVX512-NEXT: [[INDEX33:%.*]] = phi i64 [ [[TMP50]], [[EPILOG_VECTOR_PH]] ], [ [[INDEX_NEXT34:%.*]], [[EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP52:%.*]] = add i64 [[INDEX33]], 0 +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP52]] +; AVX512-NEXT: [[TMP54:%.*]] = 
getelementptr inbounds i32, i32* [[TMP53]], i32 0 +; AVX512-NEXT: [[TMP55:%.*]] = bitcast i32* [[TMP54]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD37:%.*]] = load <8 x i32>, <8 x i32>* [[TMP55]], align 4, !alias.scope !10 +; AVX512-NEXT: [[TMP56:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD37]], +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP52]] +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP57]], i32 0 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD38:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP56]], <8 x i32> undef), !alias.scope !13 +; AVX512-NEXT: [[TMP60:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD38]], [[WIDE_LOAD37]] +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP52]] +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[TMP61]], i32 0 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP60]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP56]]), !alias.scope !15, !noalias !17 +; AVX512-NEXT: [[INDEX_NEXT34]] = add i64 [[INDEX33]], 8 +; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT34]], 10000 +; AVX512-NEXT: br i1 [[TMP64]], label [[EPILOG_MIDDLE_BLOCK:%.*]], label [[EPILOG_VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; AVX512: epilog.middle.block: +; AVX512-NEXT: [[CMP_N36:%.*]] = icmp eq i64 10000, 10000 +; AVX512-NEXT: br i1 [[CMP_N36]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL35:%.*]] = phi i64 [ 10000, [[EPILOG_MIDDLE_BLOCK]] ], [ [[BC_RESUME_VAL]], [[EPILOG_PH]] ], [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP50:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] +; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; AVX512-NEXT: br 
i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] +; AVX512: for.end.loopexit: +; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -391,7 +444,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]), !alias.scope !16, !noalias !18 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -414,7 +467,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -506,7 +559,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -529,7 +582,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -538,7 +591,7 @@ ; AVX512-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* ; AVX512-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* -; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX512-NEXT: br i1 false, label [[EPILOG_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP]] to i8 addrspace(1)* @@ -554,7 +607,7 @@ ; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] ; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true -; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: @@ -569,16 +622,16 @@ ; AVX512-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !11 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4, !alias.scope !21 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -589,16 +642,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* 
[[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> undef), !alias.scope !24 ; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] ; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] ; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] @@ -609,42 +662,95 @@ ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 0 ; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 16 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 32 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP36]], i32 48 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; 
AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[EPILOG_PH]] +; AVX512: epilog.ph: +; AVX512-NEXT: [[TMP49:%.*]] = phi i1 [ false, [[MIDDLE_BLOCK]] ], [ true, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; AVX512-NEXT: [[TMP50:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; AVX512-NEXT: [[TMP51:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; AVX512-NEXT: [[REMAINDER_ITER:%.*]] = sub i64 10000, [[TMP51]] +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[REMAINDER_ITER]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[RUNTIME_CHECK_PH:%.*]] +; AVX512: runtime.check.ph: +; AVX512-NEXT: br i1 [[TMP49]], label [[EPILOG_VECTOR_MEMCHECK:%.*]], label [[EPILOG_VECTOR_PH:%.*]] +; AVX512: epilog.vector.memcheck: +; AVX512-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 10000 +; AVX512-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP19]] to i8 addrspace(1)* +; AVX512-NEXT: [[SCEVGEP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER]], i64 10000 +; AVX512-NEXT: [[SCEVGEP2122:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP21]] to i8 addrspace(1)* +; AVX512-NEXT: [[SCEVGEP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 10000 +; AVX512-NEXT: [[SCEVGEP2324:%.*]] = bitcast i32 addrspace(1)* [[SCEVGEP23]] to i8 addrspace(1)* +; AVX512-NEXT: [[BOUND025:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP2122]] +; AVX512-NEXT: [[BOUND126:%.*]] = icmp ult i8 addrspace(1)* [[TRIGGER3]], [[SCEVGEP1920]] +; AVX512-NEXT: [[FOUND_CONFLICT27:%.*]] = and i1 [[BOUND025]], [[BOUND126]] +; AVX512-NEXT: [[BOUND028:%.*]] = icmp ult i8 addrspace(1)* [[A1]], [[SCEVGEP2324]] +; AVX512-NEXT: [[BOUND129:%.*]] = icmp ult i8 addrspace(1)* [[B6]], [[SCEVGEP1920]] +; AVX512-NEXT: [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]] +; AVX512-NEXT: [[CONFLICT_RDX31:%.*]] = or i1 [[FOUND_CONFLICT27]], [[FOUND_CONFLICT30]] +; AVX512-NEXT: [[MEMCHECK_CONFLICT32:%.*]] = and i1 [[CONFLICT_RDX31]], true +; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT32]], label [[SCALAR_PH]], label [[EPILOG_VECTOR_PH]] +; AVX512: epilog.vector.ph: +; AVX512-NEXT: br label [[EPILOG_VECTOR_BODY:%.*]] +; AVX512: epilog.vector.body: +; AVX512-NEXT: [[INDEX33:%.*]] = phi i64 [ [[TMP50]], [[EPILOG_VECTOR_PH]] ], [ [[INDEX_NEXT34:%.*]], [[EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP52:%.*]] = add i64 [[INDEX33]], 0 +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP52]] +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP53]], i32 0 +; AVX512-NEXT: [[TMP55:%.*]] = bitcast i32 addrspace(1)* [[TMP54]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD37:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP55]], align 4, !alias.scope !30 +; AVX512-NEXT: [[TMP56:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD37]], +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[TMP52]] +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP57]], i32 0 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> 
addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD38:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP56]], <8 x i32> undef), !alias.scope !33 +; AVX512-NEXT: [[TMP60:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD38]], [[WIDE_LOAD37]] +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[TMP52]] +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP61]], i32 0 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP60]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP56]]), !alias.scope !35, !noalias !37 +; AVX512-NEXT: [[INDEX_NEXT34]] = add i64 [[INDEX33]], 8 +; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT34]], 10000 +; AVX512-NEXT: br i1 [[TMP64]], label [[EPILOG_MIDDLE_BLOCK:%.*]], label [[EPILOG_VECTOR_BODY]], [[LOOP38:!llvm.loop !.*]] +; AVX512: epilog.middle.block: +; AVX512-NEXT: [[CMP_N36:%.*]] = icmp eq i64 10000, 10000 +; AVX512-NEXT: br i1 [[CMP_N36]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL35:%.*]] = phi i64 [ 10000, [[EPILOG_MIDDLE_BLOCK]] ], [ [[BC_RESUME_VAL]], [[EPILOG_PH]] ], [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP50:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] +; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP39:!llvm.loop !.*]] +; AVX512: for.end.loopexit: +; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -731,7 +837,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 
4, <8 x i1> [[TMP4]]), !alias.scope !26, !noalias !28 ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -755,7 +861,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -851,7 +957,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP29:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -875,7 +981,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -884,7 +990,7 @@ ; AVX512-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* -; AVX512-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; AVX512-NEXT: br i1 false, label [[EPILOG_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; AVX512: vector.memcheck: ; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 10000 ; AVX512-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8* @@ -900,7 +1006,7 @@ ; AVX512-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]] ; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]] ; AVX512-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true -; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: @@ -915,16 +1021,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !40 ; AVX512-NEXT: [[TMP10:%.*]] = 
getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4, !alias.scope !40 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4, !alias.scope !40 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !21 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4, !alias.scope !40 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], @@ -935,16 +1041,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef), !alias.scope !43 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef), !alias.scope !43 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 32 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> undef), !alias.scope !43 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 48 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> undef), !alias.scope !24 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> undef), !alias.scope !43 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] 
to <16 x float> ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> ; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float> @@ -959,43 +1065,97 @@ ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]), !alias.scope !45, !noalias !47 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]), !alias.scope !45, !noalias !47 ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 32 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]]), !alias.scope !45, !noalias !47 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP40]], i32 48 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !26, !noalias !28 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]]), !alias.scope !45, !noalias !47 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP48:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 -; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[EPILOG_PH]] +; AVX512: epilog.ph: +; AVX512-NEXT: [[TMP53:%.*]] = phi i1 [ false, [[MIDDLE_BLOCK]] ], [ true, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; AVX512-NEXT: [[TMP54:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; AVX512-NEXT: [[TMP55:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; AVX512-NEXT: [[REMAINDER_ITER:%.*]] = sub i64 10000, [[TMP55]] +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[REMAINDER_ITER]], 8 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[RUNTIME_CHECK_PH:%.*]] +; AVX512: runtime.check.ph: +; AVX512-NEXT: br i1 [[TMP53]], label [[EPILOG_VECTOR_MEMCHECK:%.*]], label [[EPILOG_VECTOR_PH:%.*]] +; AVX512: 
epilog.vector.memcheck: +; AVX512-NEXT: [[SCEVGEP19:%.*]] = getelementptr float, float* [[A]], i64 10000 +; AVX512-NEXT: [[SCEVGEP1920:%.*]] = bitcast float* [[SCEVGEP19]] to i8* +; AVX512-NEXT: [[SCEVGEP21:%.*]] = getelementptr i32, i32* [[TRIGGER]], i64 10000 +; AVX512-NEXT: [[SCEVGEP2122:%.*]] = bitcast i32* [[SCEVGEP21]] to i8* +; AVX512-NEXT: [[SCEVGEP23:%.*]] = getelementptr float, float* [[B]], i64 10000 +; AVX512-NEXT: [[SCEVGEP2324:%.*]] = bitcast float* [[SCEVGEP23]] to i8* +; AVX512-NEXT: [[BOUND025:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP2122]] +; AVX512-NEXT: [[BOUND126:%.*]] = icmp ult i8* [[TRIGGER3]], [[SCEVGEP1920]] +; AVX512-NEXT: [[FOUND_CONFLICT27:%.*]] = and i1 [[BOUND025]], [[BOUND126]] +; AVX512-NEXT: [[BOUND028:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP2324]] +; AVX512-NEXT: [[BOUND129:%.*]] = icmp ult i8* [[B6]], [[SCEVGEP1920]] +; AVX512-NEXT: [[FOUND_CONFLICT30:%.*]] = and i1 [[BOUND028]], [[BOUND129]] +; AVX512-NEXT: [[CONFLICT_RDX31:%.*]] = or i1 [[FOUND_CONFLICT27]], [[FOUND_CONFLICT30]] +; AVX512-NEXT: [[MEMCHECK_CONFLICT32:%.*]] = and i1 [[CONFLICT_RDX31]], true +; AVX512-NEXT: br i1 [[MEMCHECK_CONFLICT32]], label [[SCALAR_PH]], label [[EPILOG_VECTOR_PH]] +; AVX512: epilog.vector.ph: +; AVX512-NEXT: br label [[EPILOG_VECTOR_BODY:%.*]] +; AVX512: epilog.vector.body: +; AVX512-NEXT: [[INDEX33:%.*]] = phi i64 [ [[TMP54]], [[EPILOG_VECTOR_PH]] ], [ [[INDEX_NEXT34:%.*]], [[EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP56:%.*]] = add i64 [[INDEX33]], 0 +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP56]] +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, i32* [[TMP57]], i32 0 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD37:%.*]] = load <8 x i32>, <8 x i32>* [[TMP59]], align 4, !alias.scope !49 +; AVX512-NEXT: [[TMP60:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD37]], +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP56]] +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP61]], i32 0 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast float* [[TMP62]] to <8 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD38:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP63]], i32 4, <8 x i1> [[TMP60]], <8 x float> undef), !alias.scope !52 +; AVX512-NEXT: [[TMP64:%.*]] = sitofp <8 x i32> [[WIDE_LOAD37]] to <8 x float> +; AVX512-NEXT: [[TMP65:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD38]], [[TMP64]] +; AVX512-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP56]] +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 0 +; AVX512-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <8 x float>* +; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP65]], <8 x float>* [[TMP68]], i32 4, <8 x i1> [[TMP60]]), !alias.scope !54, !noalias !56 +; AVX512-NEXT: [[INDEX_NEXT34]] = add i64 [[INDEX33]], 8 +; AVX512-NEXT: [[TMP69:%.*]] = icmp eq i64 [[INDEX_NEXT34]], 10000 +; AVX512-NEXT: br i1 [[TMP69]], label [[EPILOG_MIDDLE_BLOCK:%.*]], label [[EPILOG_VECTOR_BODY]], [[LOOP57:!llvm.loop !.*]] +; AVX512: epilog.middle.block: +; AVX512-NEXT: [[CMP_N36:%.*]] = icmp eq i64 10000, 10000 +; AVX512-NEXT: br i1 [[CMP_N36]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL35:%.*]] = phi i64 [ 10000, 
[[EPILOG_MIDDLE_BLOCK]] ], [ [[BC_RESUME_VAL]], [[EPILOG_PH]] ], [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: -; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL35]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100 +; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP70]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to float -; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP54]], [[CONV]] +; AVX512-NEXT: [[TMP71:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP70]] to float +; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP71]], [[CONV]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !30 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP58:!llvm.loop !.*]] +; AVX512: for.end.loopexit: +; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1131,7 +1291,7 @@ ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 ; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP39:!llvm.loop !.*]] ; AVX: middle.block: ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1155,7 +1315,7 @@ ; AVX: for.inc: ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP40:!llvm.loop !.*]] ; AVX: for.end: ; AVX-NEXT: ret void ; @@ -1195,16 +1355,16 @@ ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 
4, !alias.scope !59 ; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4, !alias.scope !59 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !59 ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !31 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4, !alias.scope !59 ; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], ; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], ; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], @@ -1215,16 +1375,16 @@ ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0 ; AVX512-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP25]], i32 8, <8 x i1> [[TMP16]], <8 x double> undef), !alias.scope !62 ; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP27]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !62 ; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[TMP18]], <8 x double> undef), !alias.scope !62 ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 24 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> undef), !alias.scope !34 +; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP19]], <8 x double> undef), !alias.scope !62 ; AVX512-NEXT: [[TMP32:%.*]] = sitofp <8 
x i32> [[WIDE_LOAD]] to <8 x double> ; AVX512-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x double> ; AVX512-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x double> @@ -1239,19 +1399,19 @@ ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]] ; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 0 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP36]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[TMP16]]), !alias.scope !64, !noalias !66 ; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 8 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast double* [[TMP46]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP37]], <8 x double>* [[TMP47]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !64, !noalias !66 ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 16 ; AVX512-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP38]], <8 x double>* [[TMP49]], i32 8, <8 x i1> [[TMP18]]), !alias.scope !64, !noalias !66 ; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[TMP40]], i32 24 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast double* [[TMP50]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !36, !noalias !38 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !64, !noalias !66 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !39 +; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP67:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1275,7 +1435,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP68:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1372,19 +1532,19 @@ ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: 
[[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !41 +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP0]], i32 4, <8 x i1> , <8 x i32> undef), !alias.scope !69 ; AVX512-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], ; AVX512-NEXT: [[TMP2:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP2]] -; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !44 +; AVX512-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP3]], i32 8, <8 x i1> [[TMP1]], <8 x double> undef), !alias.scope !72 ; AVX512-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double> ; AVX512-NEXT: [[TMP5:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER12]], [[TMP4]] ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]] -; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !46, !noalias !48 +; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP5]], <8 x double*> [[TMP6]], i32 8, <8 x i1> [[TMP1]]), !alias.scope !74, !noalias !76 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624 -; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP77:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1409,7 +1569,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !50 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], [[LOOP78:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1664,7 +1824,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP49:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1687,7 +1847,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP50:!llvm.loop !.*]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -1729,22 +1889,22 @@ ; AVX512-NEXT: 
[[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 ; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 -7 ; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !51 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !79 ; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -8 ; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 -7 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !51 +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !79 ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD12]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -16 ; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 -7 ; AVX512-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !51 +; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 4, !alias.scope !79 ; AVX512-NEXT: [[REVERSE15:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD14]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 -24 ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 -7 ; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !51 +; AVX512-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP19]], align 4, !alias.scope !79 ; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD16]], <8 x i32> undef, <8 x i32> ; AVX512-NEXT: [[TMP20:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer ; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <8 x i32> [[REVERSE13]], zeroinitializer @@ -1758,25 +1918,25 @@ ; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP28]], i32 -7 ; AVX512-NEXT: [[REVERSE18:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP30]], i32 8, <8 x i1> [[REVERSE18]], <8 x double> undef), !alias.scope !82 ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -8 ; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP31]], i32 -7 ; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP21]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 
x double> undef), !alias.scope !54 +; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP33]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> undef), !alias.scope !82 ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -16 ; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds double, double* [[TMP34]], i32 -7 ; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i1> [[TMP22]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[WIDE_MASKED_LOAD24:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE23]], <8 x double> undef), !alias.scope !82 ; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD24]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, double* [[TMP24]], i32 -24 ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP37]], i32 -7 ; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP23]], <8 x i1> undef, <8 x i32> ; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !54 +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !82 ; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD27]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP40:%.*]] = fadd <8 x double> [[REVERSE19]], ; AVX512-NEXT: [[TMP41:%.*]] = fadd <8 x double> [[REVERSE22]], @@ -1790,25 +1950,25 @@ ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 0 ; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 -7 ; AVX512-NEXT: [[TMP50:%.*]] = bitcast double* [[TMP49]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE29]], <8 x double>* [[TMP50]], i32 8, <8 x i1> [[REVERSE18]]), !alias.scope !84, !noalias !86 ; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x double> [[TMP41]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -8 ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[TMP51]], i32 -7 ; AVX512-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE31]], <8 x double>* [[TMP53]], i32 8, <8 x i1> [[REVERSE20]]), !alias.scope !84, !noalias !86 ; AVX512-NEXT: [[REVERSE33:%.*]] = shufflevector <8 x double> [[TMP42]], <8 x 
double> undef, <8 x i32> ; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -16 ; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[TMP54]], i32 -7 ; AVX512-NEXT: [[TMP56:%.*]] = bitcast double* [[TMP55]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE33]], <8 x double>* [[TMP56]], i32 8, <8 x i1> [[REVERSE23]]), !alias.scope !84, !noalias !86 ; AVX512-NEXT: [[REVERSE35:%.*]] = shufflevector <8 x double> [[TMP43]], <8 x double> undef, <8 x i32> ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds double, double* [[TMP44]], i32 -24 ; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP57]], i32 -7 ; AVX512-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <8 x double>* -; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !56, !noalias !58 +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !84, !noalias !86 ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 -; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !59 +; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP87:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1831,7 +1991,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !60 +; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP88:!llvm.loop !.*]] ; AVX512: for.end: ; AVX512-NEXT: ret void ; @@ -1963,7 +2123,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !41 +; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -1989,7 +2149,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !42 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP42:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2087,7 +2247,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP51:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2113,7 +2273,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !52 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP52:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2211,7 +2371,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP89:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2237,7 +2397,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !62 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP90:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -2380,7 +2540,7 @@ ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !44 +; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP44:!llvm.loop !.*]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2406,7 +2566,7 @@ ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !45 +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP45:!llvm.loop !.*]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: @@ -2504,7 +2664,7 @@ ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54 +; AVX2-NEXT: br i1 [[TMP64]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP54:!llvm.loop !.*]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2530,7 +2690,7 @@ ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !55 +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP55:!llvm.loop !.*]] ; AVX2: for.end.loopexit: ; AVX2-NEXT: br label [[FOR_END]] ; AVX2: for.end: @@ -2628,7 +2788,7 @@ ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !64 +; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP91:!llvm.loop !.*]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -2654,7 +2814,7 @@ ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !65 +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP92:!llvm.loop !.*]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: diff --git a/test/Transforms/LoopVectorize/epilog-loop-vectorize.ll b/test/Transforms/LoopVectorize/epilog-loop-vectorize.ll new file mode 100644 --- /dev/null +++ b/test/Transforms/LoopVectorize/epilog-loop-vectorize.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define void @epilog_loop_test(i8* nocapture %c, i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { +; CHECK-LABEL: @epilog_loop_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP12:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[EPILOG_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[C:%.*]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult i8* [[C]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult i8* 
[[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1, !alias.scope !0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1, !alias.scope !0 +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP13]], align 1, !alias.scope !3 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 16 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP15]], align 1, !alias.scope !3 +; CHECK-NEXT: [[TMP16:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = zext <16 x i8> [[WIDE_LOAD8]] to <16 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw <16 x i16> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[TMP20:%.*]] = lshr <16 x i16> [[TMP18]], +; CHECK-NEXT: [[TMP21:%.*]] = lshr <16 x i16> [[TMP19]], +; CHECK-NEXT: [[TMP22:%.*]] = trunc <16 x i16> [[TMP20]] to <16 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = trunc <16 x i16> [[TMP21]] to <16 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP22]], <16 x i8>* [[TMP27]], align 1, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i32 16 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8* [[TMP28]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP23]], <16 x i8>* [[TMP29]], align 1, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 
[[N_VEC]] +; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[EPILOG_PH]] +; CHECK: epilog.ph: +; CHECK-NEXT: [[TMP31:%.*]] = phi i1 [ false, [[MIDDLE_BLOCK]] ], [ true, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP32:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = sub i64 [[BC_RESUME_VAL]], 0 +; CHECK-NEXT: [[REMAINDER_ITER:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP33]] +; CHECK-NEXT: [[MIN_ITERS_CHECK9:%.*]] = icmp ult i64 [[REMAINDER_ITER]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK9]], label [[SCALAR_PH]], label [[RUNTIME_CHECK_PH:%.*]] +; CHECK: runtime.check.ph: +; CHECK-NEXT: br i1 [[TMP31]], label [[EPILOG_VECTOR_MEMCHECK:%.*]], label [[EPILOG_VECTOR_PH:%.*]] +; CHECK: epilog.vector.memcheck: +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i8, i8* [[C]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP12:%.*]] = getelementptr i8, i8* [[A]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8, i8* [[B]], i64 [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[BOUND014:%.*]] = icmp ult i8* [[C]], [[SCEVGEP12]] +; CHECK-NEXT: [[BOUND115:%.*]] = icmp ult i8* [[A]], [[SCEVGEP11]] +; CHECK-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; CHECK-NEXT: [[BOUND017:%.*]] = icmp ult i8* [[C]], [[SCEVGEP13]] +; CHECK-NEXT: [[BOUND118:%.*]] = icmp ult i8* [[B]], [[SCEVGEP11]] +; CHECK-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]] +; CHECK-NEXT: [[CONFLICT_RDX20:%.*]] = or i1 [[FOUND_CONFLICT16]], [[FOUND_CONFLICT19]] +; CHECK-NEXT: [[MEMCHECK_CONFLICT21:%.*]] = and i1 [[CONFLICT_RDX20]], true +; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT21]], label [[SCALAR_PH]], label [[EPILOG_VECTOR_PH]] +; CHECK: epilog.vector.ph: +; CHECK-NEXT: [[N_MOD_VF22:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: [[N_VEC23:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF22]] +; CHECK-NEXT: br label [[EPILOG_VECTOR_BODY:%.*]] +; CHECK: epilog.vector.body: +; CHECK-NEXT: [[INDEX24:%.*]] = phi i64 [ [[TMP32]], [[EPILOG_VECTOR_PH]] ], [ [[INDEX_NEXT25:%.*]], [[EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP34:%.*]] = add i64 [[INDEX24]], 0 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP34]] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP36]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <8 x i8>, <8 x i8>* [[TMP37]], align 1, !alias.scope !10 +; CHECK-NEXT: [[TMP38:%.*]] = zext <8 x i8> [[WIDE_LOAD28]] to <8 x i16> +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP34]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, i8* [[TMP39]], i32 0 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to <8 x i8>* +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <8 x i8>, <8 x i8>* [[TMP41]], align 1, !alias.scope !13 +; CHECK-NEXT: [[TMP42:%.*]] = zext <8 x i8> [[WIDE_LOAD29]] to <8 x i16> +; CHECK-NEXT: [[TMP43:%.*]] = add nuw nsw <8 x i16> [[TMP42]], [[TMP38]] +; CHECK-NEXT: [[TMP44:%.*]] = lshr <8 x i16> [[TMP43]], +; CHECK-NEXT: [[TMP45:%.*]] = trunc <8 x i16> [[TMP44]] to <8 x i8> +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 [[TMP34]] +; 
CHECK-NEXT: [[TMP47:%.*]] = getelementptr inbounds i8, i8* [[TMP46]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast i8* [[TMP47]] to <8 x i8>* +; CHECK-NEXT: store <8 x i8> [[TMP45]], <8 x i8>* [[TMP48]], align 1, !alias.scope !15, !noalias !17 +; CHECK-NEXT: [[INDEX_NEXT25]] = add i64 [[INDEX24]], 8 +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC23]] +; CHECK-NEXT: br i1 [[TMP49]], label [[EPILOG_MIDDLE_BLOCK:%.*]], label [[EPILOG_VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; CHECK: epilog.middle.block: +; CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC23]] +; CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi i64 [ [[N_VEC23]], [[EPILOG_MIDDLE_BLOCK]] ], [ [[BC_RESUME_VAL]], [[EPILOG_PH]] ], [ [[BC_RESUME_VAL]], [[EPILOG_VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL26]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP50:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP50]] to i16 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP51:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP51]] to i16 +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV3]], [[CONV]] +; CHECK-NEXT: [[TMP52:%.*]] = lshr i16 [[ADD]], 1 +; CHECK-NEXT: [[CONV4:%.*]] = trunc i16 [[TMP52]] to i8 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[CONV4]], i8* [[ARRAYIDX6]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT]], label [[FOR_BODY]], [[LOOP20:!llvm.loop !.*]] +; +entry: + %cmp12 = icmp sgt i32 %n, 0 + br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %a, i64 %indvars.iv + %0 = load i8, i8* %arrayidx + %conv = zext i8 %0 to i16 + %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2 + %conv3 = zext i8 %1 to i16 + %add = add nuw nsw i16 %conv3, %conv + %2 = lshr i16 %add, 1 + %conv4 = trunc i16 %2 to i8 + %arrayidx6 = getelementptr inbounds i8, i8* %c, i64 %indvars.iv + store i8 %conv4, i8* %arrayidx6 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}
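
The new test above exercises both vectorization stages end to end: the main loop is vectorized with <16 x i8> operations unrolled by two (32 elements per iteration), the remainder is handled by the epilog vector loop at <8 x i8>, and the scalar loop picks up any leftover iterations, with both vector loops guarded by runtime alias checks because a, b and c may overlap. For reference, here is a minimal C sketch of the source loop that such IR corresponds to (an illustrative reconstruction from the IR, not part of the patch; the pointers are deliberately left without restrict so the compiler has to emit the memchecks):

#include <stdint.h>

/* epilog_loop_test: c[i] = (a[i] + b[i]) / 2, computed in 16-bit
   arithmetic so the byte addition cannot overflow.  No restrict
   qualifiers: a, b and c may alias, which forces the vectorizer to
   emit the vector.memcheck / epilog.vector.memcheck blocks seen in
   the CHECK lines above. */
void epilog_loop_test(uint8_t *c, const uint8_t *a, const uint8_t *b, int n) {
  for (int i = 0; i < n; ++i) {
    uint16_t sum = (uint16_t)a[i] + (uint16_t)b[i]; /* zext + add nuw nsw */
    c[i] = (uint8_t)(sum >> 1);                     /* lshr + trunc */
  }
}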