diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -520,8 +520,8 @@ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + void widenIntOrFpInduction(PHINode *IV, Value *Start, Value *Step, + TruncInst *Trunc, VPValue *Def, VPValue *CastDef, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -707,9 +707,10 @@ /// For pointer induction, returns StartValue[Index * StepValue]. /// FIXME: The newly created binary instructions should contain nsw/nuw /// flags, which can be found from the original scalar operations. - Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, - const DataLayout &DL, - const InductionDescriptor &ID) const; + Value *emitTransformedIndex( + IRBuilder<> &B, Value *Index, + function_ref GetStep, + const DataLayout &DL, const InductionDescriptor &ID) const; /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also @@ -2397,8 +2398,8 @@ } void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, + Value *Step, TruncInst *Trunc, + VPValue *Def, VPValue *CastDef, VPTransformState &State) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2415,19 +2416,7 @@ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); - // Generate code for the induction step. Note that induction steps are - // required to be loop-invariant - auto CreateStepValue = [&](const SCEV *Step) -> Value * { - assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) && - "Induction step should be loop invariant"); - if (PSE.getSE()->isSCEVable(IV->getType())) { - SCEVExpander Exp(*PSE.getSE(), DL, "induction"); - return Exp.expandCodeFor(Step, Step->getType(), - LoopVectorPreHeader->getTerminator()); - } - return cast(Step)->getValue(); - }; - + auto GetStep = [Step](const InductionDescriptor &) { return Step; }; // The scalar value to broadcast. This is derived from the canonical // induction variable. If a truncation type is given, truncate the canonical // induction variable and step. Otherwise, derive these values from the @@ -2439,7 +2428,7 @@ ? Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, GetStep, DL, ID); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2475,7 +2464,6 @@ Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags()); // Now do the actual transformations, and start with creating the step value. - Value *Step = CreateStepValue(ID.getStep()); if (VF.isZero() || VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); @@ -3374,22 +3362,35 @@ return MemCheckBlock; } +static Value *expandStepExpr(const InductionDescriptor &ID, IRBuilder<> &B, + BasicBlock *LoopVectorBody, LoopInfo &LI, + ScalarEvolution &SE, const DataLayout &DL) { + auto GetInsertPoint = [&LI, &B, LoopVectorBody]() { + BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); + if (InsertBB != LoopVectorBody && + LI.getLoopFor(LoopVectorBody) == LI.getLoopFor(InsertBB)) + return LoopVectorBody->getTerminator(); + return &*B.GetInsertPoint(); + }; + + const SCEV *Step = ID.getStep(); + Type *StepType = Step->getType(); + SCEVExpander Exp(SE, DL, "induction"); + if (SE.isSCEVable(StepType)) { + return Exp.expandCodeFor(Step, StepType, GetInsertPoint()); + } + return cast(Step)->getValue(); +} + Value *InnerLoopVectorizer::emitTransformedIndex( - IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const { + IRBuilder<> &B, Value *Index, + function_ref GetStep, + const DataLayout &DL, const InductionDescriptor &ID) const { - SCEVExpander Exp(*SE, DL, "induction"); - auto Step = ID.getStep(); auto StartValue = ID.getStartValue(); - assert(Index->getType()->getScalarType() == Step->getType() && + assert(Index->getType()->getScalarType() == ID.getStep()->getType() && "Index scalar type does not match StepValue type"); - // Note: the IR at this point is broken. We cannot use SE to create any new - // SCEV and then expand it, hoping that SCEV's simplification will give us - // a more optimal code. Unfortunately, attempt of doing so on invalid IR may - // lead to various SCEV crashes. So all we can do is to use builder and rely - // on InstCombine for future simplifications. Here we handle some trivial - // cases only. auto CreateAdd = [&B](Value *X, Value *Y) { assert(X->getType() == Y->getType() && "Types don't match!"); if (auto *CX = dyn_cast(X)) @@ -3418,19 +3419,6 @@ return B.CreateMul(X, Y); }; - // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=LoopVectorBody), because - // the DomTree is not kept up-to-date for additional blocks generated in the - // vector loop. By using the header as insertion point, we guarantee that the - // expanded instructions dominate all their uses. - auto GetInsertPoint = [this, &B]() { - BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); - if (InsertBB != LoopVectorBody && - LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) - return LoopVectorBody->getTerminator(); - return &*B.GetInsertPoint(); - }; - switch (ID.getKind()) { case InductionDescriptor::IK_IntInduction: { assert(!isa(Index->getType()) && @@ -3439,31 +3427,25 @@ "Index type does not match StartValue type"); if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne()) return B.CreateSub(StartValue, Index); - auto *Offset = CreateMul( - Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())); + auto *Offset = CreateMul(Index, GetStep(ID)); return CreateAdd(StartValue, Offset); } case InductionDescriptor::IK_PtrInduction: { - assert(isa(Step) && - "Expected constant step for pointer induction"); - return B.CreateGEP( - ID.getElementType(), StartValue, - CreateMul(Index, - Exp.expandCodeFor(Step, Index->getType()->getScalarType(), - GetInsertPoint()))); + return B.CreateGEP(ID.getElementType(), StartValue, + CreateMul(Index, GetStep(ID))); } case InductionDescriptor::IK_FpInduction: { assert(!isa(Index->getType()) && "Vector indices not supported for FP inductions yet"); - assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value"); + assert(ID.getStep()->getType()->isFloatingPointTy() && + "Expected FP Step value"); auto InductionBinOp = ID.getInductionBinOp(); assert(InductionBinOp && (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub) && "Original bin op should be defined for FP induction"); - Value *StepValue = cast(Step)->getValue(); - Value *MulExp = B.CreateFMul(StepValue, Index); + Value *MulExp = B.CreateFMul(GetStep(ID), Index); return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp, "induction"); } @@ -3541,6 +3523,7 @@ assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); + // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the // PHIs that are left in the scalar version of the loop. @@ -3574,8 +3557,12 @@ Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + auto GetStep = [&](const InductionDescriptor &ID) { + return expandStepExpr(ID, B, LoopVectorBody, *LI, *PSE.getSE(), DL); + }; + EndValue = emitTransformedIndex(B, CRD, GetStep, DL, II); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). @@ -3586,7 +3573,7 @@ CRD = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, CRD, GetStep, DL, II); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3817,7 +3804,10 @@ II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + auto GetStep = [&](const InductionDescriptor &ID) { + return expandStepExpr(ID, B, LoopVectorBody, *LI, *PSE.getSE(), DL); + }; + Value *Escape = emitTransformedIndex(B, CMO, GetStep, DL, II); Escape->setName("ind.escape"); MissingVals[UI] = Escape; } @@ -4761,6 +4751,11 @@ Value *PartStart = createStepForVF( Builder, ConstantInt::get(PtrInd->getType(), Part), VF); + auto GetStep = [&](const InductionDescriptor &ID) { + return expandStepExpr(ID, Builder, LoopVectorBody, *LI, *PSE.getSE(), + DL); + }; + if (NeedsVectorIndex) { // Here we cache the whole vector, which means we can support the // extraction of any lane. However, in some cases the extractelement @@ -4774,7 +4769,7 @@ Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec); Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices); Value *SclrGep = - emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II); + emitTransformedIndex(Builder, GlobalIndices, GetStep, DL, II); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, Part); } @@ -4784,7 +4779,7 @@ PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + emitTransformedIndex(Builder, GlobalIdx, GetStep, DL, II); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } @@ -8202,8 +8197,8 @@ assert(BestVF.hasValue() && "Vectorization Factor is missing"); assert(VPlans.size() == 1 && "Not a single VPlan to execute."); - VPTransformState State{ - *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()}; + VPTransformState State{*BestVF, BestUF, LI, DT, + *PSE.getSE(), ILV.Builder, &ILV, VPlans.front().get()}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; @@ -8719,7 +8714,8 @@ IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + auto *Header = cast(Plan->getEntry()->getEntryBasicBlock()); + Builder.getInsertBlock()->insert(IVRecipe, Header->getFirstNonPhi()); IV = IVRecipe->getVPSingleValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); @@ -8791,9 +8787,8 @@ Mask); } -VPWidenIntOrFpInductionRecipe * -VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, - ArrayRef Operands) const { +VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI( + PHINode *Phi, ArrayRef Operands, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. InductionDescriptor II = Legal->getInductionVars().lookup(Phi); @@ -8801,9 +8796,12 @@ II.getKind() == InductionDescriptor::IK_FpInduction) { assert(II.getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); + + VPValue *Step = new VPValue(); + Plan.addExternalDef(Step); const SmallVectorImpl &Casts = II.getCastInsts(); return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], Casts.empty() ? nullptr : Casts.front()); + II, Phi, Operands[0], Step, Casts.empty() ? nullptr : Casts.front()); } return nullptr; @@ -8832,8 +8830,10 @@ InductionDescriptor II = Legal->getInductionVars().lookup(cast(I->getOperand(0))); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(cast(I->getOperand(0)), - Start, nullptr, I); + VPValue *Step = new VPValue(); + Plan.addExternalDef(Step); + return new VPWidenIntOrFpInductionRecipe( + II, cast(I->getOperand(0)), Start, Step, nullptr, I); } return nullptr; } @@ -9107,7 +9107,7 @@ if (auto Phi = dyn_cast(Instr)) { if (Phi->getParent() != OrigLoop->getHeader()) return tryToBlend(Phi, Operands, Plan); - if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands))) + if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan))) return toVPRecipeResult(Recipe); VPWidenPHIRecipe *PhiRecipe = nullptr; @@ -9680,6 +9680,7 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), + getOperand(1)->getLiveInIRValue(), getTruncInst(), getVPValue(0), getCastValue(), State); } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -75,7 +75,8 @@ /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. VPWidenIntOrFpInductionRecipe * - tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef Operands) const; + tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef Operands, + VPlan &Plan) const; /// Optimize the special case where the operand of \p I is a constant integer /// induction variable. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -38,6 +38,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" @@ -193,10 +194,10 @@ /// needed for generating the output IR. struct VPTransformState { VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, - DominatorTree *DT, IRBuilder<> &Builder, + DominatorTree *DT, ScalarEvolution &SE, IRBuilder<> &Builder, InnerLoopVectorizer *ILV, VPlan *Plan) - : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV), - Plan(Plan) {} + : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), SE(SE), Builder(Builder), + ILV(ILV), Plan(Plan) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -330,6 +331,8 @@ /// Hold a pointer to Dominator Tree to register new basic blocks in the loop. DominatorTree *DT; + ScalarEvolution &SE; + /// Hold a reference to the IRBuilder used to generate output IR code. IRBuilder<> &Builder; @@ -1002,12 +1005,14 @@ /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { + InductionDescriptor ID; PHINode *IV; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { + VPWidenIntOrFpInductionRecipe(InductionDescriptor ID, PHINode *IV, + VPValue *Start, VPValue *Step, + Instruction *Cast, TruncInst *Trunc = nullptr) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start, Step}), ID(ID), IV(IV) { if (Trunc) new VPValue(Trunc, this); else @@ -1051,6 +1056,8 @@ const TruncInst *getTruncInst() const { return dyn_cast_or_null(getVPValue(0)->getUnderlyingValue()); } + + const InductionDescriptor &getDescriptor() const { return ID; } }; /// A recipe for handling first order recurrences and pointer inductions. For diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/GraphWriter.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #include #include #include @@ -767,6 +768,23 @@ } #endif +static void expandAndReplaceStep(VPWidenIntOrFpInductionRecipe &Ind, + BasicBlock *TargetBB, ScalarEvolution &SE) { + Value *Step = nullptr; + const SCEV *StepS = Ind.getDescriptor().getStep(); + if (SE.isSCEVable(Ind.getStartValue()->getUnderlyingValue()->getType())) { + auto &DL = TargetBB->getModule()->getDataLayout(); + SCEVExpander Exp(SE, DL, "induction"); + Step = + Exp.expandCodeFor(StepS, StepS->getType(), TargetBB->getTerminator()); + } else + Step = cast(StepS)->getValue(); + auto *StepOp = Ind.getOperand(1); + VPValue *NewStep = new VPValue(Step); + Ind.getParent()->getPlan()->addExternalDef(NewStep); + StepOp->replaceAllUsesWith(NewStep); +} + /// Generate the code inside the body of the vectorized loop. Assumes a single /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. @@ -793,6 +811,27 @@ BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor(); assert(VectorHeaderBB && "Loop preheader does not have a single successor."); + VPBasicBlock *Header = getEntry()->getEntryBasicBlock(); + if (EnableVPlanNativePath) { + auto Iter = VPBlockUtils::blocksOnly(depth_first( + VPBlockRecursiveTraversalWrapper(getEntry()))); + for (VPBasicBlock *VPBB : Iter) { + for (VPRecipeBase &R : VPBB->phis()) { + auto *Ind = dyn_cast(&R); + if (!Ind) + continue; + expandAndReplaceStep(*Ind, VectorPreHeaderBB, State->SE); + } + } + } else { + for (VPRecipeBase &R : Header->phis()) { + auto *Ind = dyn_cast(&R); + if (!Ind) + continue; + expandAndReplaceStep(*Ind, VectorPreHeaderBB, State->SE); + } + } + // 1. Make room to generate basic-blocks inside loop body if needed. BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock( VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch"); @@ -817,7 +856,6 @@ // Fix the latch value of reduction and first-order recurrences phis in the // vector loop. - VPBasicBlock *Header = Entry->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { auto *PhiR = dyn_cast(&R); if (!PhiR || !(isa(&R) || diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -48,7 +48,10 @@ if (II.getKind() == InductionDescriptor::IK_IntInduction || II.getKind() == InductionDescriptor::IK_FpInduction) { VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr); + VPValue *Step = new VPValue(); + Plan->addExternalDef(Step); + NewRecipe = + new VPWidenIntOrFpInductionRecipe(II, Phi, Start, Step, nullptr); } else { Plan->addVPValue(Phi, VPPhi); continue;