diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -540,10 +540,6 @@ /// represented as. void truncateToMinimalBitwidths(); - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -2677,66 +2673,89 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + BasicBlock *const OrigPreHeader = LoopVectorPreHeader; + IRBuilder<> Builder(OrigPreHeader->getTerminator()); + + // No trip count check needed for predicated vectorization. + if (Cost->foldTailByMasking()) + return; // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. - Value *CheckMinIters = Builder.getFalse(); - if (!Cost->foldTailByMasking()) - CheckMinIters = Builder.CreateICmp( - P, Count, ConstantInt::get(Count->getType(), VF * UF), - "min.iters.check"); + Value *CheckMinIters = Builder.CreateICmp( + P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + auto OrigName = OrigPreHeader->getName().str(); + OrigPreHeader->setName("tc.check"); + LoopVectorPreHeader = + OrigPreHeader->splitBasicBlock(OrigPreHeader->getTerminator(), OrigName); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, OrigPreHeader); + DT->changeImmediateDominator(LoopVectorBody, LoopVectorPreHeader); + assert(DT->properlyDominates(DT->getNode(OrigPreHeader), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + // Update dominator for Bypass. 
+ DT->changeImmediateDominator(Bypass, OrigPreHeader); + DT->changeImmediateDominator(LoopExitBlock, OrigPreHeader); + if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + OrigPreHeader->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(OrigPreHeader); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + BasicBlock *const OrigPreHeader = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), + OrigPreHeader->getTerminator()); if (auto *C = dyn_cast(SCEVCheck)) if (C->isZero()) return; - assert(!BB->getParent()->hasOptSize() && + assert(!OrigPreHeader->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + auto OrigName = OrigPreHeader->getName().str(); + OrigPreHeader->setName("vector.scevcheck"); + LoopVectorPreHeader = + OrigPreHeader->splitBasicBlock(OrigPreHeader->getTerminator(), OrigName); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, OrigPreHeader); + DT->changeImmediateDominator(LoopVectorBody, LoopVectorPreHeader); + // Update dominator for Bypass only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, OrigPreHeader); + DT->changeImmediateDominator(LoopExitBlock, OrigPreHeader); + } + if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + OrigPreHeader->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(OrigPreHeader); AddedSafetyChecks = true; } @@ -2745,7 +2764,7 @@ if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + BasicBlock *const OrigPreHeader = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. 
We put the // checks into a separate block to make the more common case of few elements @@ -2753,11 +2772,11 @@ Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(OrigPreHeader->getTerminator()); if (!MemRuntimeCheck) return; - if (BB->getParent()->hasOptSize()) { + if (OrigPreHeader->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2772,23 +2791,33 @@ } // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + auto OrigName = OrigPreHeader->getName().str(); + OrigPreHeader->setName("vector.memcheck"); + LoopVectorPreHeader = + OrigPreHeader->splitBasicBlock(OrigPreHeader->getTerminator(), OrigName); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, OrigPreHeader); + DT->changeImmediateDominator(LoopVectorBody, LoopVectorPreHeader); + // Update dominator for Bypass only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, OrigPreHeader); + DT->changeImmediateDominator(LoopExitBlock, OrigPreHeader); + } + if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + OrigPreHeader->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(OrigPreHeader); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = std::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2913,12 +2942,7 @@ ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2935,12 +2959,37 @@ Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. 
- BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + LoopScalarBody = OrigLoop->getHeader(); + LoopExitBlock = OrigLoop->getExitBlock(); + assert(LoopExitBlock && "Must have an exit block"); + + BasicBlock *OrigLoopPH = OrigLoop->getLoopPreheader(); + if (OrigLoopPH->size() > 1) { + LoopVectorPreHeader = + OrigLoopPH->splitBasicBlock(OrigLoopPH->getTerminator(), "vector.ph"); + DT->addNewBlock(LoopVectorPreHeader, OrigLoopPH); + } else { + LoopVectorPreHeader = OrigLoopPH; + LoopVectorPreHeader->setName("vector.ph"); + } + + + LoopVectorBody = LoopVectorPreHeader->splitBasicBlock( + LoopVectorPreHeader->getTerminator(), "vector.body"); + DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); + + LoopMiddleBlock = LoopVectorBody->splitBasicBlock( + LoopVectorBody->getTerminator(), "middle.block"); + DT->addNewBlock(LoopMiddleBlock, LoopVectorBody); + + LoopScalarPreHeader = LoopMiddleBlock->splitBasicBlock( + LoopMiddleBlock->getTerminator(), "scalar.ph"); + DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + + // Update dominator for original loop. + DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); + // Update dominator for loop exit. + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -2950,12 +2999,14 @@ // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); + if (LoopVectorPreHeader != OrigLoopPH) + ParentLoop->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ParentLoop->addBasicBlockToLoop(LoopScalarPreHeader, *LI); + ParentLoop->addBasicBlockToLoop(LoopMiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, *LI); + Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); @@ -2967,16 +3018,16 @@ // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, ScalarPH); + emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) @@ -3004,8 +3055,9 @@ InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create( - OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. 
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -3016,23 +3068,23 @@ IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -3050,9 +3102,9 @@ // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -3061,24 +3113,17 @@ cast(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. - LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); + makeFollowupLoopID(OrigLoopID, { LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized }); if (VectorizedLoopID.hasValue()) { Lp->setLoopID(VectorizedLoopID.getValue()); @@ -3095,6 +3140,8 @@ LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); + return LoopVectorPreHeader; } @@ -3430,15 +3477,8 @@ // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(); - // Update the dominator tree. - // - // FIXME: After creating the structure of the new loop, the dominator tree is - // no longer up-to-date, and it remains that way until we update it - // here. 
An out-of-date dominator tree is problematic for SCEV, - // because SCEVExpander uses it to guide code generation. The - // vectorizer use SCEVExpanders in several places. Instead, we should - // keep the dominator tree up-to-date as we go. - updateAnalysis(); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. for (auto &Entry : *Legal->getInductionVars()) @@ -4336,26 +4376,6 @@ } // end of switch. } -void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - - // DT is not kept up-to-date for outer loop vectorization - if (EnableVPlanNativePath) - return; - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - DT->addNewBlock(LoopMiddleBlock, - LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1393,9 +1393,9 @@ private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(DominatorTree *DT, - BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); + static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopMiddleBB, + BasicBlock *LoopLatchBB, + BasicBlock *LoopPreHeaderBB); }; /// VPlanPrinter prints a given VPlan to a given output stream. The printing is diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -465,7 +465,8 @@ // We do not attempt to preserve DT for outer loop vectorization currently. if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, + L->getExitBlock()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -474,10 +475,10 @@ #endif void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { + BasicBlock *LoopLatchBB, + BasicBlock *LoopExitBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. 
@@ -508,6 +509,8 @@ DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); } + DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -16,8 +16,8 @@ ; ; CHECK-LABEL: @predicated_udiv_scalarized_operand( ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %tc.check ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %tc.check ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -25,6 +25,8 @@ ; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP]] ; CHECK: for.body.lr.ph: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[G_0]] to i64 +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 4, [[TMP0]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] @@ -92,7 +94,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[FOR_BODY_LR_PH]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[TC_CHECK]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll @@ -34,6 +34,8 @@ ; CHECK-NEXT: [[ARRAYIDX113:%.*]] = getelementptr inbounds float, float* [[T7]], i32 [[T2]] ; CHECK-NEXT: [[T8:%.*]] = load float, float* [[ARRAYIDX113]], align 4 ; CHECK-NEXT: [[CONV114:%.*]] = fpext float [[T8]] to double +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[T]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -76,8 +78,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label 
[[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTERLOOP]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[OUTERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[CONV114]], [[TC_CHECK]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[INNERLOOP:%.*]] ; CHECK: innerloop: ; CHECK-NEXT: [[I_2132:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC129:%.*]], [[INNERLOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll @@ -81,12 +81,10 @@ ; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}} ; ; FORCE-LABEL: @PR40816( -; FORCE-NEXT: entry: -; FORCE-NEXT: br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]] ; FORCE: vector.ph: ; FORCE-NEXT: br label [[VECTOR_BODY:%.*]] ; FORCE: vector.body: -; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ] +; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ] ; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ] ; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 ; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -12,7 +12,7 @@ define void @f1() { ; CHECK-LABEL: @f1( -; CHECK-NEXT: bb1: +; CHECK-NEXT: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -32,6 +32,25 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ] +; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64 +; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], [1 x %rec8]* @a, i16 0, i64 [[_TMP1]] +; CHECK-NEXT: [[_TMP4:%.*]] = bitcast %rec8* [[_TMP2]] to i16* +; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64 +; CHECK-NEXT: [[_TMP7:%.*]] = getelementptr [2 x i16*], [2 x i16*]* @b, i16 0, i64 [[_TMP6]] +; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]] +; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 +; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop !2 +; CHECK: bb3: +; CHECK-NEXT: ret void +; bb1: br label %bb2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll 
b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -14,8 +14,8 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, i8* %p) #0 { ; CHECK-LABEL: @cff_index_load_offsets( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] -; CHECK: if.then: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[TC_CHECK:%.*]], label [[EXIT:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> undef, i8 [[X:%.*]], i32 0 @@ -60,7 +60,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1, 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ null, [[MIDDLE_BLOCK]] ], [ null, [[IF_THEN]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ null, [[MIDDLE_BLOCK]] ], [ null, [[TC_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY68:%.*]] ; CHECK: for.body68: ; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -83,7 +83,7 @@ } ; AUTO_VEC-LABEL: @external_use_with_fast_math( -; AUTO_VEC-NEXT: entry: +; AUTO_VEC-NEXT: tc.check: ; AUTO_VEC-NEXT: [[TMP0:%.*]] = icmp sgt i64 %n, 1 ; AUTO_VEC-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 %n, i64 1 ; AUTO_VEC: br i1 {{.*}}, label %for.body, label %vector.ph diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -22,10 +22,10 @@ ; Function Attrs: nounwind uwtable define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo1( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: -; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* ; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4 @@ -92,10 +92,10 @@ ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo1( -; FVW2-NEXT: entry: +; FVW2-NEXT: tc.check: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] ; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] ; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* ; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 @@ -233,7 +233,7 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* 
noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -365,11 +365,11 @@ ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2( -; FVW2-NEXT: entry: +; FVW2-NEXT: tc.check: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[TC_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], @@ -549,7 +549,7 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { ; AVX512-LABEL: @foo3( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -681,11 +681,11 @@ ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo3( -; FVW2-NEXT: entry: +; FVW2-NEXT: tc.check: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ] +; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[TC_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], @@ -851,7 +851,7 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2_addrspace( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -983,11 +983,11 @@ ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace( -; FVW2-NEXT: entry: +; FVW2-NEXT: tc.check: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ 
, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[TC_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], @@ -1153,7 +1153,7 @@ define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace2( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -1285,11 +1285,11 @@ ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace2( -; FVW2-NEXT: entry: +; FVW2-NEXT: tc.check: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[TC_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], @@ -1455,7 +1455,7 @@ define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace3( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer @@ -1587,11 +1587,11 @@ ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace3( -; FVW2-NEXT: entry: +; FVW2-NEXT: tc.check: ; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] ; FVW2: vector.body: -; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] -; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[TC_CHECK]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] ; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], ; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll --- 
a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -59,6 +59,8 @@ ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: @@ -103,7 +105,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY3_US]] ; CHECK: for.end15.loopexit: ; CHECK-NEXT: br label [[FOR_END15]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -90,7 +90,7 @@ ; SSE-NEXT: ret double [[TOT_NEXT_LCSSA]] ; ; AVX-LABEL: @sumIfVector( -; AVX-NEXT: entry: +; AVX-NEXT: tc.check: ; AVX-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX: vector.ph: ; AVX-NEXT: br label [[VECTOR_BODY:%.*]] @@ -121,8 +121,8 @@ ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32 ; AVX-NEXT: br i1 [[CMP_N]], label [[DONE:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: -; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] +; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[TC_CHECK]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; AVX-NEXT: br label [[LOOP:%.*]] ; AVX: loop: ; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -7,14 +7,16 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-LABEL: @inv_load_conditional( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] 
= icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] @@ -46,7 +48,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -46,14 +46,16 @@ define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-LABEL: @inv_val_store_to_inv_address_conditional( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] @@ -87,7 +89,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -134,15 +136,17 @@ define void @variant_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32* %c, i32 %k) { ; CHECK-LABEL: @variant_val_store_to_inv_address_conditional( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[C5:%.*]] = bitcast i32* [[C:%.*]] to i8* ; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: 
[[C5:%.*]] = bitcast i32* [[C:%.*]] to i8* -; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* -; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 ; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] @@ -188,7 +192,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -20,6 +20,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[LEN:%.*]], i32 0 @@ -105,8 +107,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -159,6 +161,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -281,8 +285,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -340,6 +344,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32] ; CHECK-NEXT: 
[[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -478,8 +484,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP105]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -532,6 +538,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -814,8 +822,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP185]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -875,6 +883,8 @@ ; CHECK-NEXT: call void @init(i32* [[BASE]]) ; CHECK-NEXT: [[MIN_CMP:%.*]] = icmp ult i64 4096, [[N:%.*]] ; CHECK-NEXT: [[MIN_N:%.*]] = select i1 [[MIN_CMP]], i64 4096, i64 [[N]] +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[MIN_N]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -1001,8 +1011,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP86]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1061,6 +1071,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; 
CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1184,8 +1196,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 3072, 3072 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 1024, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1294,6 +1306,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4096 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [4096 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1545,8 +1559,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 2048, 2048 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP153]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1600,6 +1614,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [1024 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1722,8 +1738,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1777,6 +1793,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4095 x i32] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [4095 x i32]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1899,8 +1917,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; 
CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -1954,6 +1972,8 @@ ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [16383 x i8] ; CHECK-NEXT: [[BASE:%.*]] = bitcast [16383 x i8]* [[ALLOCA]] to i32* ; CHECK-NEXT: call void @init(i32* [[BASE]]) +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -2076,8 +2096,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[TC_CHECK]] ], [ [[TMP85]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -19,7 +19,7 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo1( -; AVX1-NEXT: entry: +; AVX1-NEXT: tc.check: ; AVX1-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* ; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX1-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* @@ -69,7 +69,7 @@ ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -92,7 +92,7 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo1( -; AVX2-NEXT: entry: +; AVX2-NEXT: tc.check: ; AVX2-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX2-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* @@ -190,7 +190,7 @@ ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, 
[[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -213,7 +213,7 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @foo1( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast i32* [[B:%.*]] to i8* @@ -311,7 +311,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -364,7 +364,7 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* nocapture readonly %B, i32 addrspace(1)* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo1_addrspace1( -; AVX1-NEXT: entry: +; AVX1-NEXT: tc.check: ; AVX1-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* ; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* ; AVX1-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* @@ -414,7 +414,7 @@ ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -437,7 +437,7 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo1_addrspace1( -; AVX2-NEXT: entry: +; AVX2-NEXT: tc.check: ; AVX2-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 addrspace(1)* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* ; AVX2-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* @@ -535,7 +535,7 @@ ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -558,7 +558,7 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @foo1_addrspace1( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[A1:%.*]] = bitcast i32 addrspace(1)* [[A:%.*]] to i8 
addrspace(1)* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32 addrspace(1)* [[TRIGGER:%.*]] to i8 addrspace(1)* ; AVX512-NEXT: [[B6:%.*]] = bitcast i32 addrspace(1)* [[B:%.*]] to i8 addrspace(1)* @@ -656,7 +656,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -718,7 +718,7 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX1-LABEL: @foo2( -; AVX1-NEXT: entry: +; AVX1-NEXT: tc.check: ; AVX1-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX1-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX1-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* @@ -769,7 +769,7 @@ ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -793,7 +793,7 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo2( -; AVX2-NEXT: entry: +; AVX2-NEXT: tc.check: ; AVX2-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX2-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* @@ -895,7 +895,7 @@ ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -919,7 +919,7 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @foo2( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast float* [[B:%.*]] to i8* @@ -1021,7 +1021,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1085,7 +1085,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* nocapture readonly %trigger) local_unnamed_addr #0 { ; AVX-LABEL: @foo3( -; AVX-NEXT: entry: +; AVX-NEXT: tc.check: ; AVX-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* ; AVX-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* @@ -1187,7 +1187,7 @@ ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX: scalar.ph: -; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX-NEXT: br label [[FOR_BODY:%.*]] ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1211,7 +1211,7 @@ ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo3( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* @@ -1313,7 +1313,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1402,7 +1402,7 @@ ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo4( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[B6:%.*]] = bitcast double* [[B:%.*]] to i8* @@ -1446,7 +1446,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9984, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1605,7 +1605,7 @@ ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @foo6( -; AVX2-NEXT: entry: +; AVX2-NEXT: tc.check: ; AVX2-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8* ; AVX2-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX2-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8* @@ -1732,7 +1732,7 @@ ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, 
[[MIDDLE_BLOCK]] ], [ 4095, [[TC_CHECK:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1755,7 +1755,7 @@ ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @foo6( -; AVX512-NEXT: entry: +; AVX512-NEXT: tc.check: ; AVX512-NEXT: [[OUT1:%.*]] = bitcast double* [[OUT:%.*]] to i8* ; AVX512-NEXT: [[TRIGGER3:%.*]] = bitcast i32* [[TRIGGER:%.*]] to i8* ; AVX512-NEXT: [[IN6:%.*]] = bitcast double* [[IN:%.*]] to i8* @@ -1882,7 +1882,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[ENTRY:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 4095, [[TC_CHECK:%.*]] ], [ 4095, [[VECTOR_MEMCHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1946,6 +1946,8 @@ ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX1-NEXT: br label [[TC_CHECK:%.*]] +; AVX1: tc.check: ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: @@ -2043,7 +2045,7 @@ ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2076,6 +2078,8 @@ ; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX2: for.body.preheader: ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX2-NEXT: br label [[TC_CHECK:%.*]] +; AVX2: tc.check: ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: @@ -2173,7 +2177,7 @@ ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2206,6 +2210,8 @@ ; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX512: for.body.preheader: ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: br label [[TC_CHECK:%.*]] +; AVX512: tc.check: ; AVX512-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: @@ -2303,7 +2309,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2381,6 +2387,8 @@ ; AVX1-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX1: for.body.preheader: ; AVX1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX1-NEXT: br label [[TC_CHECK:%.*]] +; AVX1: tc.check: ; AVX1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX1: vector.ph: @@ -2478,7 +2486,7 @@ ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX1: scalar.ph: -; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] ; AVX1-NEXT: br label [[FOR_BODY:%.*]] ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2511,6 +2519,8 @@ ; AVX2-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX2: for.body.preheader: ; AVX2-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX2-NEXT: br label [[TC_CHECK:%.*]] +; AVX2: tc.check: ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: @@ -2608,7 +2618,7 @@ ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX2: scalar.ph: -; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] ; AVX2-NEXT: br label [[FOR_BODY:%.*]] ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -2641,6 +2651,8 @@ ; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AVX512: for.body.preheader: ; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: br label [[TC_CHECK:%.*]] +; AVX512: tc.check: ; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32 ; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; AVX512: vector.ph: @@ -2738,7 +2750,7 @@ ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AVX512: scalar.ph: -; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
[[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll --- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -18,7 +18,7 @@ define i32 @enabled(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) { ; O1-LABEL: @enabled( -; O1-NEXT: entry: +; O1-NEXT: tc.check: ; O1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O1-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -135,7 +135,7 @@ ; O1-NEXT: ret i32 [[TMP78]] ; ; O2-LABEL: @enabled( -; O2-NEXT: entry: +; O2-NEXT: tc.check: ; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -252,7 +252,7 @@ ; O2-NEXT: ret i32 [[TMP78]] ; ; O3-LABEL: @enabled( -; O3-NEXT: entry: +; O3-NEXT: tc.check: ; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -369,7 +369,7 @@ ; O3-NEXT: ret i32 [[TMP78]] ; ; O3DEFAULT-LABEL: @enabled( -; O3DEFAULT-NEXT: entry: +; O3DEFAULT-NEXT: tc.check: ; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -486,7 +486,7 @@ ; O3DEFAULT-NEXT: ret i32 [[TMP78]] ; ; Os-LABEL: @enabled( -; Os-NEXT: entry: +; Os-NEXT: tc.check: ; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -603,7 +603,7 @@ ; Os-NEXT: ret i32 [[TMP78]] ; ; Oz-LABEL: @enabled( -; Oz-NEXT: entry: +; Oz-NEXT: tc.check: ; Oz-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; Oz-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; Oz-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -720,7 +720,7 @@ ; Oz-NEXT: ret i32 [[TMP78]] ; ; O1VEC2-LABEL: @enabled( -; O1VEC2-NEXT: entry: +; O1VEC2-NEXT: tc.check: ; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O1VEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O1VEC2-NEXT: [[TMP0:%.*]] = bitcast i32* 
[[B:%.*]] to <4 x i32>* @@ -837,7 +837,7 @@ ; O1VEC2-NEXT: ret i32 [[TMP78]] ; ; OzVEC2-LABEL: @enabled( -; OzVEC2-NEXT: entry: +; OzVEC2-NEXT: tc.check: ; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; OzVEC2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; OzVEC2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -954,7 +954,7 @@ ; OzVEC2-NEXT: ret i32 [[TMP78]] ; ; O3DIS-LABEL: @enabled( -; O3DIS-NEXT: entry: +; O3DIS-NEXT: tc.check: ; O3DIS-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O3DIS-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3DIS-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -1108,7 +1108,7 @@ ; O1-NEXT: ret i32 [[TMP1]] ; ; O2-LABEL: @nopragma( -; O2-NEXT: entry: +; O2-NEXT: tc.check: ; O2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O2-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O2-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -1225,7 +1225,7 @@ ; O2-NEXT: ret i32 [[TMP78]] ; ; O3-LABEL: @nopragma( -; O3-NEXT: entry: +; O3-NEXT: tc.check: ; O3-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O3-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -1342,7 +1342,7 @@ ; O3-NEXT: ret i32 [[TMP78]] ; ; O3DEFAULT-LABEL: @nopragma( -; O3DEFAULT-NEXT: entry: +; O3DEFAULT-NEXT: tc.check: ; O3DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; O3DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; O3DEFAULT-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -1459,7 +1459,7 @@ ; O3DEFAULT-NEXT: ret i32 [[TMP78]] ; ; Os-LABEL: @nopragma( -; Os-NEXT: entry: +; Os-NEXT: tc.check: ; Os-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 ; Os-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer ; Os-NEXT: [[TMP0:%.*]] = bitcast i32* [[B:%.*]] to <4 x i32>* @@ -1593,7 +1593,7 @@ ; Oz-NEXT: ret i32 [[TMP1]] ; ; O1VEC2-LABEL: @nopragma( -; O1VEC2-NEXT: entry: +; O1VEC2-NEXT: tc.check: ; O1VEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; O1VEC2: vector.ph: ; O1VEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 @@ -1621,7 +1621,7 @@ ; O1VEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 ; O1VEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; O1VEC2: scalar.ph: -; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] ; O1VEC2: for.body: ; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1638,7 +1638,7 @@ ; O1VEC2-NEXT: ret i32 [[TMP10]] ; ; OzVEC2-LABEL: @nopragma( -; OzVEC2-NEXT: entry: +; 
OzVEC2-NEXT: tc.check: ; OzVEC2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; OzVEC2: vector.ph: ; OzVEC2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 @@ -1666,7 +1666,7 @@ ; OzVEC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64 ; OzVEC2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; OzVEC2: scalar.ph: -; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ] ; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] ; OzVEC2: for.body: ; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -11,12 +11,10 @@ define i32 @foo_optsize() #0 { ; CHECK-LABEL: @foo_optsize( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], @@ -35,9 +33,9 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] @@ -52,7 +50,6 @@ ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; - entry: br label %for.body @@ -75,12 +72,10 @@ define i32 @foo_minsize() #1 { ; CHECK-LABEL: @foo_minsize( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], @@ -99,9 +94,9 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
!4 ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] @@ -116,7 +111,6 @@ ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; - entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -14,6 +14,8 @@ ; CHECK-NEXT: [[DOT11:%.*]] = bitcast i8 addrspace(1)* [[DOT10]] to i8 addrspace(1)* addrspace(1)* ; CHECK-NEXT: [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16 ; CHECK-NEXT: [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)* +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2:%.*]], 1 ; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16 @@ -27,8 +29,8 @@ ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP6]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP2]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]] -; CHECK-NEXT: [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[UMAX]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -65,7 +67,7 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -36,20 +36,22 @@ ; CHECK: for.body8.lr.ph: ; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[STOREMERGE_IN9]] to i8 ; CHECK-NEXT: [[DOTPROMOTED:%.*]] = load i32, i32* getelementptr inbounds ([192 x [192 x i32]], [192 x [192 x i32]]* @a, i64 0, i64 0, i64 0), align 16 +; CHECK-NEXT: br label [[TC_CHECK:%.*]] +; CHECK: tc.check: ; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[CONV3]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[UMAX:%.*]] = select i1 [[TMP6]], i32 [[TMP2]], i32 [[TMP4]] 
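; The UMAX -> UMIN rename in this hunk tracks what the value actually is: a
; select on an ult compare returns the smaller operand, i.e. a umin, and the
; regenerated checks presumably pick up the renamed IR value. A standalone
; sketch (function name hypothetical):
define i32 @umin_sketch(i32 %x, i32 %y) {
  ; unsigned-less-than compare feeding the select yields min(x, y)
  %cmp = icmp ult i32 %x, %y
  %umin = select i1 %cmp, i32 %x, i32 %y
  ret i32 %umin
}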
-; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMAX]] +; CHECK-NEXT: [[UMIN:%.*]] = select i1 [[TMP6]], i32 [[TMP2]], i32 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = sub i32 [[TMP5]], [[UMIN]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP7]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[CONV3]], -1 ; CHECK-NEXT: [[TMP9:%.*]] = zext i8 [[TMP8]] to i32 ; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i32 [[TMP2]], [[TMP9]] -; CHECK-NEXT: [[UMAX1:%.*]] = select i1 [[TMP10]], i32 [[TMP2]], i32 [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[UMAX1]] +; CHECK-NEXT: [[UMIN1:%.*]] = select i1 [[TMP10]], i32 [[TMP2]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i32 [[TMP9]], [[UMIN1]] ; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8 ; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 1, i8 [[TMP12]]) ; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0 @@ -102,8 +104,8 @@ ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP7]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[FOR_BODY8_LR_PH]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[DOTPROMOTED]], [[FOR_BODY8_LR_PH]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[CONV3]], [[TC_CHECK]] ], [ [[CONV3]], [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[DOTPROMOTED]], [[TC_CHECK]] ], [ [[DOTPROMOTED]], [[VECTOR_SCEVCHECK]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY8:%.*]] ; CHECK: for.body8: ; CHECK-NEXT: [[INC5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY8]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr42674.ll @@ -8,12 +8,12 @@ ; the vector loop was dead code leaving only a scalar remainder. 
define zeroext i8 @sum() { ; CHECK-LABEL: @sum( -; CHECK-NEXT: entry: +; CHECK-NEXT: tc.check: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[TC_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <64 x i8> [ zeroinitializer, [[TC_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>* ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16 diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll @@ -6,7 +6,7 @@ ; PR15344 define void @test1(float* nocapture %arg, i32 %arg1) nounwind { ; CHECK-LABEL: @test1( -; CHECK: preheader +; CHECK: vector.ph ; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0 ; CHECK: vector.memcheck diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -23,30 +23,33 @@ ; We can optimize this test without a tail. 
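; Schematically, the change the example1 checks below reflect: the minimum
; iteration count check is split into its own tc.check block so that vector.ph
; is always the immediate predecessor of the vector loop, and the scalar.ph
; resume phi lists tc.check instead of entry. A minimal CFG sketch (names and
; the VF*UF constant 4 are hypothetical; %n assumed nonzero):
define void @tc_check_sketch(i64 %n) {
entry:
  br label %tc.check

tc.check:
  ; the check formerly emitted straight into the preheader
  %min.iters.check = icmp ult i64 %n, 4
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:
  %n.vec = and i64 %n, -4
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %index.next = add i64 %index, 4
  %done = icmp eq i64 %index.next, %n.vec
  br i1 %done, label %middle.block, label %vector.body

middle.block:
  %cmp.n = icmp eq i64 %n, %n.vec
  br i1 %cmp.n, label %exit, label %scalar.ph

scalar.ph:
  ; the resume value now flows in from tc.check, not entry
  %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %tc.check ]
  br label %scalar.body

scalar.body:
  %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %scalar.body ]
  %iv.next = add i64 %iv, 1
  %s.done = icmp eq i64 %iv.next, %n
  br i1 %s.done, label %exit, label %scalar.body

exit:
  ret void
}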
define void @example1() optsize { ; CHECK-LABEL: @example1( +; CHECK-NEXT: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 16 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP7]], align 16 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[TMP10:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[TMP9:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[TMP9:%.*]] -; CHECK: br i1 undef, label [[TMP10]], label [[TMP9]], !llvm.loop !2 -; CHECK: ret void +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 8: +; CHECK-NEXT: br i1 undef, label [[TMP9]], label [[TMP8]], !llvm.loop !2 +; CHECK: 9: +; CHECK-NEXT: ret void ; br label %1 @@ -72,12 +75,10 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-LABEL: @example2( ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]] -; CHECK: .lr.ph5.preheader: +; CHECK-NEXT: br i1 [[TMP1]], label [[VECTOR_PH:%.*]], label [[DOTPREHEADER:%.*]] +; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: ; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588 ; 
CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP3]], i32 0 @@ -88,45 +89,43 @@ ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP8]], i32 0 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP10]], align 16 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[INDEX]] +; CHECK-NEXT: store i32 [[X:%.*]], i32* [[TMP9]], align 16 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1 -; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] ; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]] -; CHECK-NEXT: store i32 [[X]], i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP4]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP11]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] ; CHECK: pred.store.continue4: -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP7]], i32 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] ; CHECK: pred.store.if5: -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]] -; CHECK-NEXT: store i32 [[X]], i32* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP5]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP13]], align 8 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3 -; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP7]], i32 3 +; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.if7: -; CHECK-NEXT: 
[[TMP16:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP7]] -; CHECK-NEXT: store i32 [[X]], i32* [[TMP16]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP6]] +; CHECK-NEXT: store i32 [[X]], i32* [[TMP15]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] ; CHECK: pred.store.continue8: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]] -; CHECK: ._crit_edge: -; CHECK-NEXT: ret void +; CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH:%.*]] ; %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph5, label %.preheader @@ -222,6 +221,7 @@ ; We CAN vectorize this example because the pointers are marked as noalias. define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize { ; CHECK-LABEL: @example23b( +; CHECK-NEXT: tc.check: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -229,21 +229,23 @@ ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[TMP6:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[TMP6:%.*]] -; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7 -; CHECK: ret void +; CHECK-NEXT: br label [[TMP5:%.*]] +; CHECK: 5: +; CHECK-NEXT: br i1 undef, label [[TMP6]], label [[TMP5]], !llvm.loop !7 +; CHECK: 6: +; CHECK-NEXT: ret void ; br label %1 @@ -268,98 +270,99 @@ ; We CAN vectorize this example by folding the tail it entails. 
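; For a folded tail such as example23c, the patch emits no trip-count check at
; all (emitMinimumIterationCountCheck returns early), so the function entry is
; the vector preheader itself and out-of-range lanes are disabled by a
; per-iteration mask. A reduced one-lane sketch (names hypothetical; the 260
; bound mirrors the checks below, i.e. a trip count of 257 rounded up to a
; multiple of VF*UF = 4; the real output repeats the if/continue pair per lane):
define void @tail_fold_sketch(i32* %dst) {
vector.ph:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.store.continue ]
  %bsi = insertelement <4 x i64> undef, i64 %index, i32 0
  %bs = shufflevector <4 x i64> %bsi, <4 x i64> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i64> %bs, <i64 0, i64 1, i64 2, i64 3>
  ; lanes at or past the real trip count contribute nothing
  %mask = icmp ult <4 x i64> %induction, <i64 257, i64 257, i64 257, i64 257>
  %lane0 = extractelement <4 x i1> %mask, i32 0
  br i1 %lane0, label %pred.store.if, label %pred.store.continue

pred.store.if:
  %gep = getelementptr i32, i32* %dst, i64 %index
  store i32 0, i32* %gep, align 4
  br label %pred.store.continue

pred.store.continue:
  %index.next = add i64 %index, 4
  %done = icmp eq i64 %index.next, 260
  br i1 %done, label %middle.block, label %vector.body

middle.block:
  ret void
}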
define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize { ; CHECK-LABEL: @example23c( -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: +; CHECK-NEXT: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE22:%.*]] ] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[INDUCTION]], +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[SRC:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = load i16, i16* [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* [[NEXT_GEP]], align 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP4:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP3]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = phi i16 [ undef, [[VECTOR_BODY]] ], [ [[TMP2]], [[PRED_LOAD_IF]] ] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] ; CHECK: pred.load.if11: -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[NEXT_GEP4]], align 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]] ; CHECK: pred.load.continue12: -; CHECK-NEXT: [[TMP8:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP7]], [[PRED_LOAD_IF11]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE]] ], [ [[TMP6]], [[PRED_LOAD_IF11]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] ; CHECK: pred.load.if13: -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 2 -; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 2 +; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP9]] +; 
CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[NEXT_GEP5]], align 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]] ; CHECK: pred.load.continue14: -; CHECK-NEXT: [[TMP12:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP11]], [[PRED_LOAD_IF13]] ] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 -; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE12]] ], [ [[TMP10]], [[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 +; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16:%.*]] ; CHECK: pred.load.if15: -; CHECK-NEXT: [[TMP14:%.*]] = or i64 [[INDEX]], 3 -; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2 +; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i16, i16* [[SRC]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[NEXT_GEP6]], align 2 ; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE16]] ; CHECK: pred.load.continue16: -; CHECK-NEXT: [[TMP16:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP15]], [[PRED_LOAD_IF15]] ] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = phi i16 [ undef, [[PRED_LOAD_CONTINUE14]] ], [ [[TMP14]], [[PRED_LOAD_IF15]] ] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 +; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP18:%.*]] = zext i16 [[TMP4]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7 +; CHECK-NEXT: [[TMP17:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i32 [[TMP17]], 7 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4 +; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP7]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 +; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] ; CHECK: pred.store.if17: -; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7 -; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[INDEX]], 1 -; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]] -; CHECK-NEXT: store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4 +; CHECK-NEXT: [[TMP20:%.*]] = zext i16 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP21:%.*]] = shl nuw nsw i32 [[TMP20]], 7 +; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[INDEX]], 1 +; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP22]] +; CHECK-NEXT: store i32 [[TMP21]], i32* [[NEXT_GEP8]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] ; CHECK: pred.store.continue18: -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; 
CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
; CHECK: pred.store.if19:
-; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
-; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
-; CHECK-NEXT: [[TMP27:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
-; CHECK-NEXT: store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT: [[TMP25:%.*]] = shl nuw nsw i32 [[TMP24]], 7
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP26]]
+; CHECK-NEXT: store i32 [[TMP25]], i32* [[NEXT_GEP9]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
; CHECK: pred.store.continue20:
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
; CHECK: pred.store.if21:
-; CHECK-NEXT: [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
-; CHECK-NEXT: [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
-; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
-; CHECK-NEXT: store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT: [[TMP28:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT: [[TMP29:%.*]] = shl nuw nsw i32 [[TMP28]], 7
+; CHECK-NEXT: [[TMP30:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP30]]
+; CHECK-NEXT: store i32 [[TMP29]], i32* [[NEXT_GEP10]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
; CHECK: pred.store.continue22:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br i1 true, label [[TMP33:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[TMP33:%.*]]
-; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9
-; CHECK: ret void
+; CHECK-NEXT: br label [[TMP32:%.*]]
+; CHECK: 32:
+; CHECK-NEXT: br i1 undef, label [[TMP33]], label [[TMP32]], !llvm.loop !9
+; CHECK: 33:
+; CHECK-NEXT: ret void
;
br label %1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -18,7 +18,7 @@
;
define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized(
-; CHECK-NEXT: entry:
+; CHECK-NEXT: tc.check:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -31,14 +31,14 @@
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1
@@ -46,7 +46,7 @@
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -91,12 +91,10 @@
;
define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
+; CHECK-NEXT: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
@@ -112,13 +110,26 @@
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]])
+; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]]), !llvm.access.group !6
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]]
+; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -150,7 +161,7 @@
;
define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized2(
-; CHECK-NEXT: entry:
+; CHECK-NEXT: tc.check:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -163,14 +174,14 @@
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
@@ -178,7 +189,7 @@
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -391,7 +391,7 @@
;DISABLED_MASKED_STRIDED: for.end:
;ENABLED_MASKED_STRIDED-LABEL: @unconditional_strided1_optsize(
-;ENABLED_MASKED_STRIDED-NEXT: entry:
+;ENABLED_MASKED_STRIDED-NEXT: tc.check:
;ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
;ENABLED_MASKED_STRIDED: vector.body:
;ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll
--- a/llvm/test/Transforms/LoopVectorize/debugloc.ll
+++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll
@@ -4,7 +4,7 @@
; Make sure we are preserving debug info in the vectorized code.
-; CHECK: for.body.lr.ph
+; CHECK: tc.check
; CHECK: min.iters.check = icmp ult i64 {{.*}}, 2, !dbg !{{[0-9]+}}
; CHECK: vector.body
; CHECK: index {{.*}}, !dbg ![[LOC:[0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -22,7 +22,7 @@
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %for.preheader ]
+; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %tc.check ]
; CHECK: scalar.body:
; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
;
@@ -79,7 +79,7 @@
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %for.preheader ]
+; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %tc.check ]
; CHECK: scalar.body:
; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
;
@@ -147,7 +147,7 @@
; CHECK: middle.block:
; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
; CHECK: scalar.ph:
-; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %for.preheader ]
+; CHECK: %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %tc.check ]
; CHECK: scalar.body:
; CHECK: %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
;
diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
--- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
+++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll
@@ -48,6 +48,8 @@
; CHECK-LABEL: @minloopattr(
; CHECK-NEXT: top:
; CHECK-NEXT: [[T:%.*]] = load float, float* [[ARG:%.*]]
+; CHECK-NEXT: br label [[TC_CHECK:%.*]]
+; CHECK: tc.check:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[T]], i32 0
@@ -82,8 +84,8 @@
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536
; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TOP:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65537, [[MIDDLE_BLOCK]] ], [ 1, [[TC_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TC_CHECK]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
--- a/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -7,8 +7,8 @@
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
+; CHECK-NEXT: br i1 [[CMP26]], label [[TC_CHECK:%.*]], label [[FOR_END:%.*]]
+; CHECK: tc.check:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
@@ -53,7 +53,7 @@
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[IF_END14:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
--- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -8,10 +8,10 @@
; Test predication of stores.
define i32 @test(i32* nocapture %f) #0 {
; UNROLL-LABEL: @test(
-; UNROLL-NEXT: entry:
+; UNROLL-NEXT: tc.check:
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
+; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
; UNROLL-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[F:%.*]], i64 [[INDUCTION]]
@@ -56,7 +56,7 @@
; UNROLL-NEXT: ret i32 0
;
; UNROLL-NOSIMPLIFY-LABEL: @test(
-; UNROLL-NOSIMPLIFY-NEXT: entry:
+; UNROLL-NOSIMPLIFY-NEXT: tc.check:
; UNROLL-NOSIMPLIFY-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NOSIMPLIFY: vector.ph:
; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -89,7 +89,7 @@
; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: for.body:
; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -109,10 +109,10 @@
; UNROLL-NOSIMPLIFY-NEXT: ret i32 0
;
; VEC-LABEL: @test(
-; VEC-NEXT: entry:
+; VEC-NEXT: tc.check:
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
-; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
; VEC-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
@@ -222,8 +222,8 @@
; UNROLL-NOSIMPLIFY-NEXT: entry:
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY9:%.*]]
; UNROLL-NOSIMPLIFY: for.body9:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND:%.*]], label [[FOR_INC26:%.*]], label [[FOR_BODY14_PREHEADER:%.*]]
-; UNROLL-NOSIMPLIFY: for.body14.preheader:
+; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND:%.*]], label [[FOR_INC26:%.*]], label [[TC_CHECK:%.*]]
+; UNROLL-NOSIMPLIFY: tc.check:
; UNROLL-NOSIMPLIFY-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NOSIMPLIFY: vector.ph:
; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -264,8 +264,8 @@
; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 1, 0
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_INC26_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ undef, [[FOR_BODY14_PREHEADER]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ undef, [[FOR_BODY14_PREHEADER]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ undef, [[TC_CHECK]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ undef, [[TC_CHECK]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY14:%.*]]
; UNROLL-NOSIMPLIFY: for.body14:
; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_INC23:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -349,10 +349,10 @@
;
define void @minimal_bit_widths(i1 %c) {
; UNROLL-LABEL: @minimal_bit_widths(
-; UNROLL-NEXT: entry:
+; UNROLL-NEXT: tc.check:
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 undef, [[INDEX]]
; UNROLL-NEXT: [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 0
; UNROLL-NEXT: [[INDUCTION4:%.*]] = add i64 [[OFFSET_IDX]], -1
@@ -398,7 +398,7 @@
; UNROLL-NEXT: ret void
;
; UNROLL-NOSIMPLIFY-LABEL: @minimal_bit_widths(
-; UNROLL-NOSIMPLIFY-NEXT: entry:
+; UNROLL-NOSIMPLIFY-NEXT: tc.check:
; UNROLL-NOSIMPLIFY-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; UNROLL-NOSIMPLIFY: vector.ph:
; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -434,8 +434,8 @@
; UNROLL-NOSIMPLIFY-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NOSIMPLIFY: scalar.ph:
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ undef, [[ENTRY]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ undef, [[MIDDLE_BLOCK]] ], [ undef, [[TC_CHECK]] ]
; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: for.body:
; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = phi i64 [ [[TMP6:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -457,12 +457,12 @@
; UNROLL-NOSIMPLIFY-NEXT: ret void
;
; VEC-LABEL: @minimal_bit_widths(
-; VEC-NEXT: entry:
+; VEC-NEXT: tc.check:
; VEC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> undef, i1 [[C:%.*]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT5]], <2 x i1> undef, <2 x i32> zeroinitializer
; VEC-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC: vector.body:
-; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[TC_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[INDEX]], i32 0
; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer
; VEC-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1>
diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll
--- a/llvm/test/Transforms/LoopVectorize/induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/induction.ll
@@ -534,7 +534,7 @@
; CHECK: br
; CHECK: scalar.ph
-; CHECK: phi i8 [ %{{.*}}, %middle.block ], [ %[[LOAD]], %entry ]
+; CHECK: phi i8 [ %{{.*}}, %middle.block ], [ %[[LOAD]], %tc.check ]
@e = global i8 1, align 1
@d = common global i32 0, align 4
diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
--- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll
@@ -165,14 +165,16 @@
; scalar store the value extracted from the last element of the vector value.
; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT: br label [[TC_CHECK:%.*]]
+; CHECK: tc.check:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
@@ -270,15 +272,17 @@
; vectorization.
; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[NTRUNC]], [[K:%.*]]
+; CHECK-NEXT: br label [[TC_CHECK:%.*]]
+; CHECK: tc.check:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]]
@@ -366,13 +370,15 @@
; address.
; CHECK-LABEL: variant_val_store_to_inv_address
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
+; CHECK-NEXT: [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
+; CHECK-NEXT: br label [[TC_CHECK:%.*]]
+; CHECK: tc.check:
; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1
; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[B2:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A1]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1
; CHECK-NEXT: [[SMAX3:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1
diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -2,7 +2,7 @@
; CHECK-LABEL: @postinc
; CHECK-LABEL: scalar.ph:
-; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %tc.check ]
; CHECK-LABEL: for.end:
; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
; CHECK: ret i32 %[[RET]]
@@ -24,7 +24,7 @@
; CHECK-LABEL: middle.block:
; CHECK: %[[v3:.+]] = sub i32 %n.vec, 1
; CHECK-LABEL: scalar.ph:
-; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %tc.check ]
; CHECK-LABEL: for.end:
; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %[[v3]], %middle.block ]
; CHECK: ret i32 %[[RET]]
@@ -110,7 +110,7 @@
; CHECK-LABEL: @multiphi
; CHECK-LABEL: scalar.ph:
-; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ]
+; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %tc.check ]
; CHECK-LABEL: for.end:
; CHECK: %phi = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
; CHECK: %phi2 = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr34681.ll b/llvm/test/Transforms/LoopVectorize/pr34681.ll
--- a/llvm/test/Transforms/LoopVectorize/pr34681.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr34681.ll
@@ -21,7 +21,7 @@
; Instead the loop is vectorized with an unknown stride.
; CHECK-LABEL: @foo1
-; CHECK: for.body.lr.ph
+; CHECK: tc.check
; CHECK-NOT: %ident.check = icmp ne i32 %N, 1
; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
@@ -80,7 +80,7 @@
; CHECK-LABEL: @foo2
-; CHECK: for.body.lr.ph
+; CHECK: tc.check
; CHECK-NOT: %ident.check = icmp ne i16 %N, 1
; CHECK-NOT: %[[TEST:[0-9]+]] = or i1 false, %ident.check
; CHECK-NOT: br i1 %[[TEST]], label %scalar.ph, label %vector.ph
diff --git a/llvm/test/Transforms/LoopVectorize/pr38697.ll b/llvm/test/Transforms/LoopVectorize/pr38697.ll
--- a/llvm/test/Transforms/LoopVectorize/pr38697.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr38697.ll
@@ -37,7 +37,7 @@
; CHECK-NOT: udiv
; CHECK: loop1.body:
; CHECK: while.cond.preheader:
-; CHECK: while.body.preheader:
+; CHECK: tc.check:
; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[TMP0:%.*]], [[COUNT:%.*]]
; CHECK: vector.ph:
; CHECK: exit:
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
--- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll
@@ -14,8 +14,8 @@
; CHECK-LABEL: @foo(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg !4
-; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]], !dbg !4
-; CHECK: for.body.preheader:
+; CHECK-NEXT: br i1 [[CMP6]], label [[TC_CHECK:%.*]], label [[FOR_END:%.*]], !dbg !4
+; CHECK: tc.check:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1, !dbg !9
; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg !9
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg !9
@@ -50,7 +50,7 @@
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]], !dbg !9
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg !9
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ], !dbg !9
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[TC_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], !dbg !9
; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg !9
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], !dbg !9
@@ -161,11 +161,8 @@
; CHECK: !9 = !DILocation(line: 101, column: 1, scope: !{{.*}})
define dso_local void @forced_optsize(i64* noalias nocapture readonly %x_p, i64* noalias nocapture readonly %y_p, i64* noalias nocapture %z_p) minsize optsize {
-;
; FORCED_OPTSIZE: remark: <unknown>:0:0: Code-size may be reduced by not forcing vectorization, or by source-code modifications eliminating the need for runtime checks (e.g., adding 'restrict').
-; FORCED_OPTSIZE-LABEL: @forced_optsize(
; FORCED_OPTSIZE: vector.body:
-;
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
--- a/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
+++ b/llvm/test/Transforms/LoopVectorize/tbaa-nodep.ll
@@ -25,9 +25,9 @@
; runtime checks.
; CHECK-LABEL: @test1
-; CHECK: entry:
-; CHECK-NEXT: br label %vector.body
-; CHECK: vector.body:
+; CHECK: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa