diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2677,16 +2677,16 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + BasicBlock *const OrigPreHeader = LoopVectorPreHeader; + IRBuilder<> Builder(OrigPreHeader->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the // vector trip count is zero. This check also covers the case where adding one // to the backedge-taken count overflowed leading to an incorrect trip count // of zero. In this case we will also jump to the scalar loop. - auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; + auto P = + Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; // If tail is to be folded, vector loop takes care of all iterations. Value *CheckMinIters = Builder.getFalse(); @@ -2695,48 +2695,52 @@ P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + LoopVectorPreHeader = OrigPreHeader->splitBasicBlock( + OrigPreHeader->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. 
- DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, OrigPreHeader); if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + OrigPreHeader->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(OrigPreHeader); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + BasicBlock *const OrigPreHeader = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), + OrigPreHeader->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; - assert(!BB->getParent()->hasOptSize() && + assert(!OrigPreHeader->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + OrigPreHeader->setName("vector.scevcheck"); + LoopVectorPreHeader = OrigPreHeader->splitBasicBlock( + OrigPreHeader->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. 
- DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, OrigPreHeader); if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + OrigPreHeader->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(OrigPreHeader); AddedSafetyChecks = true; } @@ -2745,7 +2749,7 @@ if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + BasicBlock *const OrigPreHeader = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements @@ -2753,11 +2757,11 @@ Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(OrigPreHeader->getTerminator()); if (!MemRuntimeCheck) return; - if (BB->getParent()->hasOptSize()) { + if (OrigPreHeader->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2772,23 +2776,26 @@ } // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + OrigPreHeader->setName("vector.memcheck"); + LoopVectorPreHeader = OrigPreHeader->splitBasicBlock( + OrigPreHeader->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. 
- DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, OrigPreHeader); + if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + OrigPreHeader->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(OrigPreHeader); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2913,12 +2920,7 @@ ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2935,12 +2937,18 @@ Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. 
- BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + LoopScalarBody = OrigLoop->getHeader(); + LoopVectorPreHeader = OrigLoop->getLoopPreheader(); + LoopExitBlock = OrigLoop->getExitBlock(); + assert(LoopVectorPreHeader && "Invalid loop structure"); + assert(LoopExitBlock && "Must have an exit block"); + + LoopVectorBody = LoopVectorPreHeader->splitBasicBlock( + LoopVectorPreHeader->getTerminator(), "vector.body"); + LoopMiddleBlock = LoopVectorBody->splitBasicBlock( + LoopVectorBody->getTerminator(), "middle.block"); + LoopScalarPreHeader = LoopMiddleBlock->splitBasicBlock( + LoopMiddleBlock->getTerminator(), "scalar.ph"); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -2950,12 +2958,12 @@ // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); + ParentLoop->addBasicBlockToLoop(LoopScalarPreHeader, *LI); + ParentLoop->addBasicBlockToLoop(LoopMiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, *LI); + Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); @@ -2967,16 +2975,16 @@ // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. 
- emitSCEVChecks(Lp, ScalarPH); + emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) @@ -3004,8 +3012,9 @@ InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create( - OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -3016,23 +3025,23 @@ IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. 
for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -3050,9 +3059,9 @@ // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -3061,24 +3070,17 @@ cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. 
- LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = - makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, - LLVMLoopVectorizeFollowupVectorized}); + makeFollowupLoopID(OrigLoopID, { LLVMLoopVectorizeFollowupAll, + LLVMLoopVectorizeFollowupVectorized }); if (VectorizedLoopID.hasValue()) { Lp->setLoopID(VectorizedLoopID.getValue());