diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -410,8 +410,11 @@ virtual ~InnerLoopVectorizer() = default; - /// Create a new empty loop. Unlink the old loop and connect the new one. - /// Return the pre-header block of the new loop. + /// Create a new empty loop that will contain vectorized instructions later + /// on, while the old loop will be used as the scalar remainder. Control flow + /// is generated around the vectorized (and scalar epilogue) loops consisting + /// of various checks and bypasses. Return the pre-header block of the new + /// loop. BasicBlock *createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. @@ -662,6 +665,22 @@ const DataLayout &DL, const InductionDescriptor &ID) const; + /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, + /// vector loop preheader, middle block and scalar preheader. Also + /// allocate a loop object for the new vector loop and return it. + Loop *createVectorLoopSkeleton(StringRef Prefix); + + /// Create new phi nodes for the induction variables to resume iteration count + /// in the scalar epilogue, from where the vectorized loop left off (given by + /// \p VectorTripCount). + void createInductionResumeValues(Loop *L, Value *VectorTripCount); + + /// Complete the loop skeleton by adding debug MDs, creating appropriate + /// conditional branches in the middle block, preparing the builder and + /// running the verifier. Take in the vector loop \p L as argument, and return + /// the preheader of the completed vector loop. + BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID); + /// Add additional metadata to \p To that was not present on \p Orig. /// /// Currently this is used to add the noalias annotations based on the @@ -2957,56 +2976,7 @@ llvm_unreachable("invalid enum"); } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { - /* - In this function we generate a new loop. The new loop will contain - the vectorized instructions while the old loop will continue to run the - scalar remainder. - - [ ] <-- loop iteration number check. - / | - / v - | [ ] <-- vector loop bypass (may consist of multiple blocks). - | / | - | / v - || [ ] <-- vector pre header. - |/ | - | v - | [ ] \ - | [ ]_| <-- vector loop. - | | - | v - | -[ ] <--- middle-block. - | / | - | / v - -|- >[ ] <--- new preheader. - | | - | v - | [ ] \ - | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. - ... - */ - - MDNode *OrigLoopID = OrigLoop->getLoopID(); - - // Some loops have a single integer induction variable, while other loops - // don't. One example is c++ iterators that often have multiple pointer - // induction variables. In the code below we also support a case where we - // don't have a single induction variable. - // - // We try to obtain an induction variable from the original loop as hard - // as possible. However if we don't find one that: - // - is an integer - // - counts from zero, stepping by one - // - is the size of the widest induction variable type - // then we create a new one. - OldInduction = Legal->getPrimaryInduction(); - Type *IdxTy = Legal->getWidestInductionType(); - - // Split the single block loop into the two loop structure described above. +Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) { LoopScalarBody = OrigLoop->getHeader(); LoopVectorPreHeader = OrigLoop->getLoopPreheader(); LoopExitBlock = OrigLoop->getExitBlock(); @@ -3015,16 +2985,16 @@ LoopMiddleBlock = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - LI, nullptr, "middle.block"); + LI, nullptr, Twine(Prefix) + "middle.block"); LoopScalarPreHeader = SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI, - nullptr, "scalar.ph"); + nullptr, Twine(Prefix) + "scalar.ph"); // We intentionally don't let SplitBlock to update LoopInfo since // LoopVectorBody should belong to another loop than LoopVectorPreHeader. // LoopVectorBody is explicitly added to the correct place few lines later. LoopVectorBody = SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, - nullptr, nullptr, "vector.body"); + nullptr, nullptr, Twine(Prefix) + "vector.body"); // Update dominator for loop exit. DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); @@ -3041,37 +3011,12 @@ LI->addTopLevelLoop(Lp); } Lp->addBasicBlockToLoop(LoopVectorBody, *LI); + return Lp; +} - // Find the loop boundaries. - Value *Count = getOrCreateTripCount(Lp); - - Value *StartIdx = ConstantInt::get(IdxTy, 0); - - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. This check also covers the case where the - // backedge-taken count is uint##_max: adding one to it will overflow leading - // to an incorrect trip count of zero. In this (rare) case we will also jump - // to the scalar loop. - emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); - - // Generate the code to check any assumptions that we've made for SCEV - // expressions. - emitSCEVChecks(Lp, LoopScalarPreHeader); - - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - emitMemRuntimeChecks(Lp, LoopScalarPreHeader); - - // Generate the induction variable. - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - Constant *Step = ConstantInt::get(IdxTy, VF * UF); - Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, - getDebugLocFromInstOrOperands(OldInduction)); - +void InnerLoopVectorizer::createInductionResumeValues(Loop *L, + Value *VectorTripCount) { + assert(VectorTripCount && L && "Expected valid arguments"); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3079,10 +3024,6 @@ // iteration in the vectorized loop. // If we come from a bypass edge then we need to start from the original // start value. - - // This variable saves the new starting index for the scalar loop. It is used - // to test if there are any tail iterations left once the vector loop has - // completed. for (auto &InductionEntry : Legal->getInductionVars()) { PHINode *OrigPhi = InductionEntry.first; InductionDescriptor II = InductionEntry.second; @@ -3096,13 +3037,13 @@ Value *&EndValue = IVEndValues[OrigPhi]; if (OrigPhi == OldInduction) { // We know what the end value is. - EndValue = CountRoundDown; + EndValue = VectorTripCount; } else { - IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); + CastInst::getCastOpcode(VectorTripCount, true, StepType, true); + Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); @@ -3119,6 +3060,15 @@ BCResumeVal->addIncoming(II.getStartValue(), BB); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } +} + +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, + MDNode *OrigLoopID) { + assert(L && "Expected valid loop."); + + // The trip counts should be cached by now. + Value *Count = getOrCreateTripCount(L); + Value *VectorTripCount = getOrCreateVectorTripCount(L); // We need the OrigLoop (scalar loop part) latch terminator to help // produce correct debug info for the middle block BB instructions. @@ -3136,7 +3086,7 @@ Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", + VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead @@ -3152,7 +3102,7 @@ ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + assert(LoopVectorPreHeader == L->getLoopPreheader() && "Inconsistent vector loop preheader"); Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); @@ -3160,7 +3110,7 @@ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized}); if (VectorizedLoopID.hasValue()) { - Lp->setLoopID(VectorizedLoopID.getValue()); + L->setLoopID(VectorizedLoopID.getValue()); // Do not setAlreadyVectorized if loop attributes have been defined // explicitly. @@ -3170,9 +3120,9 @@ // Keep all loop hints from the original loop on the vector loop (we'll // replace the vectorizer-specific hints below). if (MDNode *LID = OrigLoop->getLoopID()) - Lp->setLoopID(LID); + L->setLoopID(LID); - LoopVectorizeHints Hints(Lp, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3183,6 +3133,89 @@ return LoopVectorPreHeader; } +BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { + /* + In this function we generate a new loop. The new loop will contain + the vectorized instructions while the old loop will continue to run the + scalar remainder. + + [ ] <-- loop iteration number check. + / | + / v + | [ ] <-- vector loop bypass (may consist of multiple blocks). + | / | + | / v + || [ ] <-- vector pre header. + |/ | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + | v + | -[ ] <--- middle-block. + | / | + | / v + -|- >[ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v + >[ ] <-- exit block. + ... + */ + + // Get the metadata of the original loop before it gets modified. + MDNode *OrigLoopID = OrigLoop->getLoopID(); + + // Create an empty vector loop, and prepare basic blocks for the runtime checks. + Loop *Lp = createVectorLoopSkeleton(""); + + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. This check also covers the case where the + // backedge-taken count is uint##_max: adding one to it will overflow leading + // to an incorrect trip count of zero. In this (rare) case we will also jump + // to the scalar loop. + emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader); + + // Generate the code to check any assumptions that we've made for SCEV + // expressions. + emitSCEVChecks(Lp, LoopScalarPreHeader); + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + emitMemRuntimeChecks(Lp, LoopScalarPreHeader); + + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. + // + // We try to obtain an induction variable from the original loop as hard + // as possible. However if we don't find one that: + // - is an integer + // - counts from zero, stepping by one + // - is the size of the widest induction variable type + // then we create a new one. + OldInduction = Legal->getPrimaryInduction(); + Type *IdxTy = Legal->getWidestInductionType(); + Value *StartIdx = ConstantInt::get(IdxTy, 0); + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Constant *Step = ConstantInt::get(IdxTy, VF * UF); + Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Induction = + createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + getDebugLocFromInstOrOperands(OldInduction)); + + // Emit phis for the new starting index of the scalar loop. + createInductionResumeValues(Lp, CountRoundDown); + + return completeLoopSkeleton(Lp, OrigLoopID); +} + // Fix up external users of the induction variable. At this point, we are // in LCSSA form, with all external PHIs that use the IV having one input value, // coming from the remainder loop. We need those PHIs to also have a correct