diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -370,9 +370,8 @@ return None; } - +struct GeneratedRTChecks; namespace llvm { - /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -395,11 +394,12 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, GeneratedRTChecks &Check) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), - VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {} + VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), + Check(Check) {} virtual ~InnerLoopVectorizer() = default; /// Create a new empty loop. Unlink the old loop and connect the new one. @@ -756,6 +756,7 @@ /// The profitablity analysis. LoopVectorizationCostModel *Cost; + GeneratedRTChecks &Check; // Record whether runtime checks are added. bool AddedSafetyChecks = false; @@ -776,9 +777,9 @@ const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, - LoopVectorizationCostModel *CM) + LoopVectorizationCostModel *CM, GeneratedRTChecks &Check) : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1, - UnrollFactor, LVL, CM) {} + UnrollFactor, LVL, CM, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -1509,9 +1510,81 @@ /// Values to ignore in the cost model when VF > 1. SmallPtrSet VecValuesToIgnore; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front in a temporary block to allow better +/// estimating the cost and un-linked from the existing IR. After deciding to +/// vectorize, the checks are moved backed. If deciding not to vectorize, the +/// temporary block is completely removed. +struct GeneratedRTChecks { + BasicBlock *TmpBlock = nullptr; + BasicBlock *Preheader; + ScalarEvolution *SE; + Value *SCEVCheck; + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + + /// Generate runtime checks in temporary block (Checks.TmpBlock), so we can + /// accurately estimate the cost of the runtime checks. The block is un-linked + /// from the IR and is added back during vector code generation. If there is + /// no vector code generation, the check blocks is removed completely. + + static GeneratedRTChecks Create(BasicBlock *Preheader, + const LoopAccessInfo &LAI, + PredicatedScalarEvolution &PSE, + DominatorTree *DT, LoopInfo *LI) { + GeneratedRTChecks Checks; + Checks.SE = PSE.getSE(); + Checks.Preheader = Preheader; + BasicBlock *LoopHeader = Preheader->getSingleSuccessor(); + Checks.TmpBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "tmp.rtchecks"); + + SCEVExpander Exp(*Checks.SE, Preheader->getModule()->getDataLayout(), + "scev.check"); + Checks.SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), Checks.TmpBlock->getTerminator()); + + std::tie(Checks.FirstCheckInst, Checks.MemRuntimeCheck) = + LAI.addRuntimeChecks(Checks.TmpBlock->getTerminator()); + + // Unhook the temporary block with the checks, update various places + // accordingly. + Checks.TmpBlock->replaceAllUsesWith(Checks.Preheader); + Checks.TmpBlock->getTerminator()->moveBefore( + Checks.Preheader->getTerminator()); + Checks.Preheader->getTerminator()->eraseFromParent(); + DT->changeImmediateDominator(LoopHeader, Checks.Preheader); + DT->eraseNode(Checks.TmpBlock); + LI->removeBlock(Checks.TmpBlock); + return Checks; + } + + ~GeneratedRTChecks() { + if (!TmpBlock) + return; + + // Completely remove the block. + for (auto &I : make_early_inc_range(reverse(*TmpBlock))) { + SE->forgetValue(&I); + SE->eraseValueFromMap(&I); + I.eraseFromParent(); + } + TmpBlock->eraseFromParent(); + + // ScalarEvolutionExpander may insert some instructions here. We clean them up here, to avoid unexpected instructions appearing. + for (auto &I : make_early_inc_range(reverse(*Preheader))) { + if (I.getType()->isVoidTy() || I.mayReadOrWriteMemory() || !I.use_empty()) + continue; + SE->forgetValue(&I); + SE->eraseValueFromMap(&I); + I.eraseFromParent(); + } + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2718,17 +2791,11 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; - - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); + BasicBlock *const SCEVCheckBlock = Check.TmpBlock; - if (auto *C = dyn_cast(SCEVCheck)) + if (!Check.TmpBlock) + return; + if (auto *C = dyn_cast(Check.SCEVCheck)) if (C->isZero()) return; @@ -2736,10 +2803,30 @@ "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); + Check.TmpBlock = SplitBlock(SCEVCheckBlock, + cast(Check.SCEVCheck)->getNextNode(), + nullptr, nullptr, nullptr, ""); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + Check.TmpBlock->replaceAllUsesWith(SCEVCheckBlock); + Check.TmpBlock->getTerminator()->moveBefore(SCEVCheckBlock->getTerminator()); + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { @@ -2749,7 +2836,7 @@ ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2761,16 +2848,10 @@ // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = Check.TmpBlock; - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); - if (!MemRuntimeCheck) + // Check if we generated code that checks in runtime if arrays overlap. We put the checks into a separate block to make the more common case of few elements faster. + if (!Check.MemRuntimeCheck) return; if (MemCheckBlock->getParent()->hasOptSize()) { @@ -2787,21 +2868,33 @@ }); } + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, Check.TmpBlock); + auto *BI = BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); + BI->setDebugLoc(PHTerm->getDebugLoc()); + + DT->addNewBlock(Check.TmpBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + Check.TmpBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(Check.TmpBlock, *LI); + MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } + Check.TmpBlock = nullptr; ReplaceInstWithInst( MemCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.MemRuntimeCheck)); LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; @@ -7596,8 +7689,9 @@ LVP.setBestPlan(VF.Width, 1); + GeneratedRTChecks Check; InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM); + &CM, Check); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -7756,6 +7850,9 @@ IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } + GeneratedRTChecks Checks = GeneratedRTChecks::Create( + L->getLoopPreheader(), *LVL.getLAI(), PSE, DT, LI); + // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; @@ -7855,8 +7952,8 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. - InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM); + InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, + Checks); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -7868,7 +7965,7 @@ } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM); + &LVL, &CM, Checks); LVP.executePlan(LB, DT); ++LoopsVectorized;