Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -274,7 +274,8 @@ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(nullptr), OldInduction(nullptr), WidenMap(UnrollFactor), - Legal(nullptr), AddedSafetyChecks(false) {} + TripCount(nullptr), VectorTripCount(nullptr), Legal(nullptr), + AddedSafetyChecks(false) {} // Perform the actual loop widening (vectorization). void vectorize(LoopVectorizationLegality *L) { @@ -383,6 +384,22 @@ /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); + /// Returns (and creates if needed) the original loop trip count. + Value *getOrCreateTripCount(Loop *NewLoop); + + /// Returns (and creates if needed) the trip count of the widened loop. + Value *getOrCreateVectorTripCount(Loop *NewLoop); + + /// Emit a bypass check to see if the trip count would overflow, or we + /// wouldn't have enough iterations to execute one vector loop. + void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass); + /// Emit a bypass check to see if the vector trip count is nonzero. + void emitVectorLoopEnteredCheck(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks to check if strides we've assumed to be one really are. + void emitStrideChecks(Loop *L, BasicBlock *Bypass); + /// Emit bypass checks to check any memory assumptions we may have made. + void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass); + /// This is a helper class that holds the vectorizer state. It maps scalar /// instructions to vector instructions. When the code is 'unrolled' then /// then a single scalar value is mapped to multiple vector parts. The parts @@ -476,6 +493,10 @@ /// Maps scalars to widened vectors. ValueMap WidenMap; EdgeMaskCache MaskCache; + /// Trip count of the original loop. + Value *TripCount; + /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) + Value *VectorTripCount; LoopVectorizationLegality *Legal; @@ -2597,6 +2618,156 @@ return Induction; } +Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) { + if (TripCount) + return TripCount; + + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + // Find the loop boundaries. + const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + Type *IdxTy = Legal->getWidestInductionType(); + + // The exit count might have the type of i64 while the phi is i32. This can + // happen if we have an induction variable that is sign extended before the + // compare. The only way that we get a backedge taken count is that the + // induction variable was signed and as such will not overflow. In such a case + // truncation is legal. + if (ExitCount->getType()->getPrimitiveSizeInBits() > + IdxTy->getPrimitiveSizeInBits()) + ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); + + const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr(BackedgeTakeCount, + SE->getConstant(BackedgeTakeCount->getType(), 1)); + + const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, DL, "induction"); + + // Count holds the overall loop count (N). + TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + L->getLoopPreheader()->getTerminator()); + + return TripCount; +} + +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { + if (VectorTripCount) + return VectorTripCount; + + Value *TC = getOrCreateTripCount(L); + IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); + + // Now we need to generate the expression for N - (N % VF), which is + // the part that the vectorized body will execute. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(TC->getType(), VF * UF); + Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + VectorTripCount = Builder.CreateSub(TC, R, "n.vec"); + + return VectorTripCount; +} + +void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, + BasicBlock *Bypass) { + Value *Count = getOrCreateTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + Value *CheckMinIters = + Builder.CreateICmpULT(Count, + ConstantInt::get(Count->getType(), VF * UF), + "min.iters.check"); + + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "min.iters.checked"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, CheckMinIters)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitVectorLoopEnteredCheck(Loop *L, + BasicBlock *Bypass) { + Value *TC = getOrCreateVectorTripCount(L); + BasicBlock *BB = L->getLoopPreheader(); + IRBuilder<> Builder(BB->getTerminator()); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. + Value *Cmp = Builder.CreateICmpEQ(TC, Constant::getNullValue(TC->getType()), + "cmp.zero"); + + // Generate code to check that the loop's trip count that we computed by + // adding one to the backedge-taken count will not overflow. + BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), + "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, Cmp)); + LoopBypassBlocks.push_back(BB); +} + +void InnerLoopVectorizer::emitStrideChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code to check that the strides we assumed to be one are really + // one. We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + Instruction *StrideCheck; + Instruction *FirstCheckInst; + std::tie(FirstCheckInst, StrideCheck) = addStrideCheck(BB->getTerminator()); + if (!StrideCheck) + return; + + // Create a new block containing the stride check. + BB->setName("vector.stridecheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, StrideCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + +void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, + BasicBlock *Bypass) { + BasicBlock *BB = L->getLoopPreheader(); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *FirstCheckInst; + Instruction *MemRuntimeCheck; + std::tie(FirstCheckInst, MemRuntimeCheck) = + Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + if (!MemRuntimeCheck) + return; + + // Create a new block containing the memory check. + BB->setName("vector.memcheck"); + auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); + if (L->getParentLoop()) + L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); + ReplaceInstWithInst(BB->getTerminator(), + BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); + LoopBypassBlocks.push_back(BB); + AddedSafetyChecks = true; +} + + void InnerLoopVectorizer::createEmptyLoop() { /* In this function we generate a new loop. The new loop will contain @@ -2654,59 +2825,6 @@ if (OldInduction && IdxTy != OldInduction->getType()) OldInduction = nullptr; - // Find the loop boundaries. - const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // The exit count might have the type of i64 while the phi is i32. This can - // happen if we have an induction variable that is sign extended before the - // compare. The only way that we get a backedge taken count is that the - // induction variable was signed and as such will not overflow. In such a case - // truncation is legal. - if (ExitCount->getType()->getPrimitiveSizeInBits() > - IdxTy->getPrimitiveSizeInBits()) - ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy); - - const SCEV *BackedgeTakeCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy); - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(BackedgeTakeCount, - SE->getConstant(BackedgeTakeCount->getType(), 1)); - - const DataLayout &DL = OldBasicBlock->getModule()->getDataLayout(); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, DL, "induction"); - - // The loop minimum iterations check below is to ensure the loop has enough - // trip count so the generated vector loop will likely be executed and the - // preparation and rounding-off costs will likely be worthy. - // - // The minimum iteration check also covers case where the backedge-taken - // count is uint##_max. Adding one to it will cause overflow and an - // incorrect loop trip count being generated in the vector body. In this - // case we also want to directly jump to the scalar remainder loop. - Value *ExitCountValue = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - if (ExitCountValue->getType()->isPointerTy()) - ExitCountValue = CastInst::CreatePointerCast(ExitCountValue, IdxTy, - "exitcount.ptrcnt.to.int", - VectorPH->getTerminator()); - - Instruction *CheckMinIters = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, ExitCountValue, - ConstantInt::get(ExitCountValue->getType(), VF * UF), - "min.iters.check", VectorPH->getTerminator()); - - Builder.SetInsertPoint(VectorPH->getTerminator()); - StartIdx = ConstantInt::get(IdxTy, 0); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), - VectorPH->getTerminator()); - - LoopBypassBlocks.push_back(VectorPH); - // Split the single block loop into the two loop structure described above. BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); @@ -2730,107 +2848,43 @@ } Lp->addBasicBlockToLoop(VecBody, *LI); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) - // inside the loop. - Builder.SetInsertPoint(VecBody->getFirstNonPHI()); - - // Generate code to check that the loop's trip count is not less than the - // minimum loop iteration number threshold. - BasicBlock *NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "min.iters.checked"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(ScalarPH, NewVectorPH, CheckMinIters)); - VectorPH = NewVectorPH; + // Find the loop boundaries. + Value *Count = getOrCreateTripCount(Lp); + StartIdx = ConstantInt::get(IdxTy, 0); - // This is the IR builder that we use to add all of the logic for bypassing - // the new vector loop. - IRBuilder<> BypassBuilder(VectorPH->getTerminator()); - setDebugLocFromInst(BypassBuilder, - getDebugLocFromInstOrOperands(OldInduction)); + // We need to test whether the backedge-taken count is uint##_max. Adding one + // to it will cause overflow and an incorrect loop trip count in the vector + // body. In case of overflow we want to directly jump to the scalar remainder + // loop. + emitMinimumIterationCountCheck(Lp, ScalarPH); + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. + emitVectorLoopEnteredCheck(Lp, MiddleBlock); + // Generate the code to check that the strides we assumed to be one are really + // one. We want the new basic block to start at the first instruction in a + // sequence of instructions that form a check. + emitStrideChecks(Lp, MiddleBlock); + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + emitMemRuntimeChecks(Lp, MiddleBlock); // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - // Now we need to generate the expression for N - (N % VF), which is - // the part that the vectorized body will execute. + // Generate the induction variable. // The loop step is equal to the vectorization factor (num of SIMD elements) // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); - Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); - Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); - Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down"); - - // Generate the induction variable. Induction = - createInductionVariable(Lp, StartIdx, IdxEndRoundDown, Step, + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, getDebugLocFromInstOrOperands(OldInduction)); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. - Value *Cmp = - BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, "cmp.zero"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - ReplaceInstWithInst(VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, Cmp)); - VectorPH = NewVectorPH; - // Generate the code to check that the strides we assumed to be one are really - // one. We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check. - Instruction *StrideCheck; - Instruction *FirstCheckInst; - std::tie(FirstCheckInst, StrideCheck) = - addStrideCheck(VectorPH->getTerminator()); - if (StrideCheck) { - AddedSafetyChecks = true; - // Create a new block containing the stride check. - VectorPH->setName("vector.stridecheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, StrideCheck)); - - VectorPH = NewVectorPH; - } - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(VectorPH->getTerminator()); - if (MemRuntimeCheck) { - AddedSafetyChecks = true; - // Create a new block containing the memory check. - VectorPH->setName("vector.memcheck"); - NewVectorPH = - VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.ph"); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewVectorPH, *LI); - LoopBypassBlocks.push_back(VectorPH); - - // Replace the branch into the memory check block with a conditional branch - // for the "few elements case". - ReplaceInstWithInst( - VectorPH->getTerminator(), - BranchInst::Create(MiddleBlock, NewVectorPH, MemRuntimeCheck)); - - VectorPH = NewVectorPH; - } + // This is the IR builder that we use to add all of the logic for bypassing + // the new vector loop. + IRBuilder<> BypassBuilder(LoopBypassBlocks.back()->getTerminator()); + setDebugLocFromInst(BypassBuilder, + getDebugLocFromInstOrOperands(OldInduction)); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -2846,8 +2900,6 @@ PHINode *ResumeIndex = nullptr; LoopVectorizationLegality::InductionList::iterator I, E; LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); - // Set builder to point to last bypass block. - BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); for (I = List->begin(), E = List->end(); I != E; ++I) { PHINode *OrigPhi = I->first; InductionDescriptor II = I->second; @@ -2863,7 +2915,7 @@ Value *EndValue; if (OrigPhi == OldInduction) { // We know what the end value is. - EndValue = IdxEndRoundDown; + EndValue = CountRoundDown; // We also know which PHI node holds it. ResumeIndex = ResumeVal; } else { @@ -2899,7 +2951,7 @@ MiddleBlock->getTerminator()); for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); - ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + ResumeIndex->addIncoming(CountRoundDown, VecBody); } // Make sure that we found the index where scalar loop needs to continue. @@ -2909,7 +2961,7 @@ // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, ResumeIndex, "cmp.n", MiddleBlock->getTerminator()); ReplaceInstWithInst(MiddleBlock->getTerminator(), @@ -2919,7 +2971,7 @@ Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Save the state. - LoopVectorPreHeader = VectorPH; + LoopVectorPreHeader = Lp->getLoopPreheader(); LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; Index: test/Transforms/LoopVectorize/debugloc.ll =================================================================== --- test/Transforms/LoopVectorize/debugloc.ll +++ test/Transforms/LoopVectorize/debugloc.ll @@ -12,7 +12,7 @@ ; CHECK: load <2 x i32>, <2 x i32>* {{.*}}, !dbg ![[LOC2]] ; CHECK: add <2 x i32> {{.*}}, !dbg ![[LOC2]] ; CHECK: add i64 %index, 2, !dbg ![[LOC]] -; CHECK: icmp eq i64 %index.next, %end.idx.rnd.down, !dbg ![[LOC]] +; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]] ; CHECK: middle.block ; CHECK: add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg ![[LOC2]] ; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC2]] Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -112,8 +112,7 @@ ; condition and branch directly to the scalar loop. ; CHECK-LABEL: max_i32_backedgetaken -; CHECK: %min.iters.check = icmp ult i32 0, 2 -; CHECK: br i1 %min.iters.check, label %scalar.ph, label %min.iters.checked +; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]