diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2684,8 +2684,10 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the @@ -2702,48 +2704,55 @@ P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + // Create new preheader for vector loop. + LoopVectorPreHeader = TCCheckBlock->splitBasicBlock( + TCCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, TCCheckBlock); if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for SCEV checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast(SCEVCheck)) if (C->isZero()) return; - assert(!BB->getParent()->hasOptSize() && + assert(!SCEVCheckBlock->getParent()->hasOptSize() && "Cannot SCEV check stride or overflow when optimizing for size"); - // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + SCEVCheckBlock->setName("vector.scevcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = SCEVCheckBlock->splitBasicBlock( + SCEVCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, SCEVCheckBlock); if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2752,7 +2761,9 @@ if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for runtime memory checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements @@ -2760,11 +2771,11 @@ Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; - if (BB->getParent()->hasOptSize()) { + if (MemCheckBlock->getParent()->hasOptSize()) { assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && "Cannot emit memory checks when optimizing for size, unless forced " "to vectorize."); @@ -2778,24 +2789,27 @@ }); } - // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + MemCheckBlock->setName("vector.memcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = MemCheckBlock->splitBasicBlock( + MemCheckBlock->getTerminator(), "vector.ph"); // Update dominator tree immediately if the generated block is a // LoopBypassBlock because SCEV expansions to generate loop bypass // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); + DT->addNewBlock(LoopVectorPreHeader, MemCheckBlock); + if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + L->getParentLoop()->addBasicBlockToLoop(LoopVectorPreHeader, *LI); + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. LVer = std::make_unique(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2920,12 +2934,7 @@ ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2942,12 +2951,18 @@ Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. - BasicBlock *VecBody = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); - BasicBlock *MiddleBlock = - VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + LoopScalarBody = OrigLoop->getHeader(); + LoopVectorPreHeader = OrigLoop->getLoopPreheader(); + LoopExitBlock = OrigLoop->getExitBlock(); + assert(LoopVectorPreHeader && "Invalid loop structure"); + assert(LoopExitBlock && "Must have an exit block"); + + LoopVectorBody = LoopVectorPreHeader->splitBasicBlock( + LoopVectorPreHeader->getTerminator(), "vector.body"); + LoopMiddleBlock = LoopVectorBody->splitBasicBlock( + LoopVectorBody->getTerminator(), "middle.block"); + LoopScalarPreHeader = LoopMiddleBlock->splitBasicBlock( + LoopMiddleBlock->getTerminator(), "scalar.ph"); // Create and register the new vector loop. Loop *Lp = LI->AllocateLoop(); @@ -2957,12 +2972,12 @@ // before calling any utilities such as SCEV that require valid LoopInfo. if (ParentLoop) { ParentLoop->addChildLoop(Lp); - ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); - ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); + ParentLoop->addBasicBlockToLoop(LoopScalarPreHeader, *LI); + ParentLoop->addBasicBlockToLoop(LoopMiddleBlock, *LI); } else { LI->addTopLevelLoop(Lp); } - Lp->addBasicBlockToLoop(VecBody, *LI); + Lp->addBasicBlockToLoop(LoopVectorBody, *LI); // Find the loop boundaries. Value *Count = getOrCreateTripCount(Lp); @@ -2974,16 +2989,16 @@ // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitMinimumIterationCountCheck(Lp, ScalarPH); + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); // Generate the code to check any assumptions that we've made for SCEV // expressions. - emitSCEVChecks(Lp, ScalarPH); + emitSCEVChecks(Lp, LoopScalarPreHeader); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements // faster. - emitMemRuntimeChecks(Lp, ScalarPH); + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) @@ -3011,8 +3026,9 @@ InductionDescriptor II = InductionEntry.second; // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create( - OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -3023,23 +3039,23 @@ IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -3057,9 +3073,9 @@ // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -3068,20 +3084,15 @@ cast(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. - LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + "Inconsistent vector loop preheader"); + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,