diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1233,15 +1233,6 @@ LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); - if (!PrimaryInduction) { - reportVectorizationFailure( - "No primary induction, cannot fold tail by masking", - "Missing a primary induction variable in the loop, which is " - "needed in order to fold tail by masking as required.", - "NoPrimaryInduction", ORE, TheLoop); - return false; - } - SmallPtrSet ReductionLiveOuts; for (auto &Reduction : getReductionVars()) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6585,6 +6585,7 @@ &ILV, CallbackILV}; State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); + State.CanonicalIV = ILV.Induction; //===------------------------------------------------===// // @@ -6771,7 +6772,15 @@ // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction()); + // Start by constructing the desired canonical IV. + VPValue *IV = nullptr; + if (Legal->getPrimaryInduction()) + IV = Plan->getVPValue(Legal->getPrimaryInduction()); + else { + auto IVRecipe = new VPWidenCanonicalIVRecipe(); + Builder.getInsertBlock()->appendRecipe(IVRecipe); + IV = IVRecipe->getVPValue(); + } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); return BlockMaskCache[BB] = BlockMask; @@ -7130,12 +7139,13 @@ NeedDef.insert(Branch->getCondition()); } - // If the tail is to be folded by masking, the primary induction variable - // needs to be represented in VPlan for it to model early-exit masking. + // If the tail is to be folded by masking, the primary induction variable, if + // exists needs to be represented in VPlan for it to model early-exit masking. // Also, both the Phi and the live-out instruction of each reduction are // required in order to introduce a select between them in VPlan. if (CM.foldTailByMasking()) { - NeedDef.insert(Legal->getPrimaryInduction()); + if (Legal->getPrimaryInduction()) + NeedDef.insert(Legal->getPrimaryInduction()); for (auto &Reduction : Legal->getReductionVars()) { NeedDef.insert(Reduction.first); NeedDef.insert(Reduction.second.getLoopExitInstr()); @@ -7572,9 +7582,8 @@ !PreferPredicateOverEpilog; // 2) Next, if disabling predication is requested on the command line, honour - // this and request a scalar epilogue. Also do this if we don't have a - // primary induction variable, which is required for predication. - if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + // this and request a scalar epilogue. + if (PredicateOptDisabled) return CM_ScalarEpilogueAllowed; // 3) and 4) look if enabling predication is requested on the command line, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -329,6 +329,9 @@ /// Values they correspond to. VPValue2ValueTy VPValue2Value; + /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). + Value *CanonicalIV = nullptr; + /// Hold the trip count of the scalar loop. Value *TripCount = nullptr; @@ -610,6 +613,7 @@ VPPredInstPHISC, VPReplicateSC, VPWidenCallSC, + VPWidenCanonicalIVSC, VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, @@ -1144,6 +1148,36 @@ VPSlotTracker &SlotTracker) const override; }; +/// A Recipe for widening the canonical induction variable of the vector loop. +class VPWidenCanonicalIVRecipe : public VPRecipeBase { +private: + /// A VPValue representing the canonical vector IV. + VPValue Val; + +public: + VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {} + ~VPWidenCanonicalIVRecipe() override = default; + + /// Return the VPValue representing the canonical vector induction variable of + /// the vector loop. + const VPValue *getVPValue() const { return &Val; } + VPValue *getVPValue() { return &Val; } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC; + } + + /// Generate a canonical vector induction variable of the vector loop, with + /// start = { for 0 <= Part < UF}, and + /// step = . + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of /// output IR instructions. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -802,6 +802,29 @@ O << "\\l\""; } +void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { + Value *CanonicalIV = State.CanonicalIV; + Type *STy = CanonicalIV->getType(); + IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); + Value *VStart = Builder.CreateVectorSplat(State.VF, CanonicalIV, "broadcast"); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + SmallVector Indices; + for (unsigned Lane = 0, VF = State.VF; Lane < VF; ++Lane) + Indices.push_back(ConstantInt::get(STy, Part * VF + Lane)); + Constant *VStep = ConstantVector::get(Indices); + // Add the consecutive indices to the vector value. + Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv"); + State.set(getVPValue(), CanonicalVectorIV, Part); + } +} + +void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << " +\n" << Indent << "\"EMIT "; + getVPValue()->printAsOperand(O, SlotTracker); + O << " = WIDEN-CANONICAL-INDUCTION \\l\""; +} + template void DomTreeBuilder::Calculate(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { @@ -901,6 +924,8 @@ for (const VPRecipeBase &Recipe : *VPBB) { if (const auto *VPI = dyn_cast(&Recipe)) assignSlot(VPI); + else if (const auto *VPIV = dyn_cast(&Recipe)) + assignSlot(VPIV->getVPValue()); } } diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -168,9 +168,17 @@ ret void } -; N is unknown, we need a tail. Can't vectorize because loop has no primary -; induction. +; Loop has no primary induction as its integer IV has step -1 starting at +; unknown N, but can still be vectorized. ;CHECK-LABEL: @example3( +; CHECK: vector.ph: +; CHECK: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> {{.*}}, <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VPIV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], +; CHECK: {{.*}} = icmp ule <4 x i64> [[VPIV]], [[BROADCAST_SPLAT2]] ;CHECK-NOT: <4 x i32> ;CHECK: ret void define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize { @@ -237,12 +245,12 @@ ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[TMP7:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP6:%.*]] -; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !7 +; CHECK: br i1 undef, label [[TMP7]], label [[TMP6]], !llvm.loop !11 ; CHECK: ret void ; br label %1 @@ -353,12 +361,12 @@ ; CHECK: pred.store.continue22: ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[TMP34:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP33:%.*]] -; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !9 +; CHECK: br i1 undef, label [[TMP34]], label [[TMP33]], !llvm.loop !13 ; CHECK: ret void ; br label %1 diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll --- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -1,12 +1,11 @@ -; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -force-vector-width=4 -S | FileCheck %s -; Check that when we can't predicate this loop that it is still vectorised (with -; an epilogue). -; TODO: the reason this can't be predicated is because a primary induction -; variable can't be found (not yet) for this counting down loop. But with that -; fixed, this should be able to be predicated. +; Check that a counting-down loop which has no primary induction variable +; is vectorized with preferred predication. ; CHECK-LABEL: vector.body: +; CHECK-LABEL: middle.block: +; CHECK-NEXT: br i1 true, target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"