diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -508,7 +508,7 @@ /// the corresponding type. void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, Value *Start, TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance, @@ -619,7 +619,7 @@ /// can also be a truncate instruction. void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -629,7 +629,6 @@ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, - VPValue *CastDef, VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of @@ -639,29 +638,6 @@ /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// If there is a cast involved in the induction variable \p ID, which should - /// be ignored in the vectorized loop body, this function records the - /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the - /// cast. We had already proved that the casted Phi is equal to the uncasted - /// Phi in the vectorized loop (under a runtime guard), and therefore - /// there is no need to vectorize the cast - the same value can be used in the - /// vector loop for both the Phi and the cast. - /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, - /// Otherwise, \p VectorLoopValue is a widened/vectorized value. - /// - /// \p EntryVal is the value from the original loop that maps to the vector - /// phi node and is used to distinguish what is the IV currently being - /// processed - original one (if \p EntryVal is a phi corresponding to the - /// original IV) or the "newly-created" one based on the proof mentioned above - /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the - /// latter case \p EntryVal is a TruncInst and we must not record anything for - /// that IV, but it's error-prone to expect callers of this routine to care - /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane = UINT_MAX); - /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -2356,8 +2332,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPValue *CastDef, - VPTransformState &State) { + Instruction *EntryVal, VPValue *Def, VPTransformState &State) { assert((isa(EntryVal) || isa(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2420,8 +2395,6 @@ if (isa(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, - State, Part); LastInduction = cast( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); @@ -2455,42 +2428,10 @@ return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane) { - assert((isa(EntryVal) || isa(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // This induction variable is not the phi from the original loop but the - // newly-created IV based on the proof that casted Phi is equal to the - // uncasted Phi in the vectorized loop (under a runtime guard possibly). It - // re-uses the same InductionDescriptor that original IV uses but we don't - // have to do any recording in this case - that is done when original IV is - // processed. - if (isa(EntryVal)) - return; - - if (!CastDef) { - assert(ID.getCastInsts().empty() && - "there are casts for ID, but no CastDef"); - return; - } - assert(!ID.getCastInsts().empty() && - "there is a CastDef, but no casts for ID"); - // Only the first Cast instruction in the Casts vector is of interest. - // The rest of the Casts (if exist) have no uses outside the - // induction update chain itself. - if (Lane < UINT_MAX) - State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); - else - State.set(CastDef, VectorLoopVal, Part); -} - void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2556,8 +2497,6 @@ State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, - State, Part); } }; @@ -2579,8 +2518,7 @@ // least one user in the loop that is not widened. auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); return; } @@ -2588,14 +2526,13 @@ // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); return; } @@ -2605,7 +2542,7 @@ Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, @@ -2659,7 +2596,7 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { // We shouldn't have to build scalar steps if we aren't vectorizing. assert(VF.isVector() && "VF should be greater than one"); @@ -2709,8 +2646,6 @@ auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. @@ -2730,8 +2665,6 @@ auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part, Lane); } } } @@ -8061,18 +7994,6 @@ return U == Ind || DeadInstructions.count(cast(U)); })) DeadInstructions.insert(IndUpdate); - - // We record as "Dead" also the type-casting instructions we had identified - // during induction analysis. We don't need any handling for them in the - // vectorized loop because we have proven that, under a proper runtime - // test guarding the vectorized loop, the value of the phi, and the casted - // value of the phi, are the same. The last instruction in this casting chain - // will get its scalar/vector/widened def from the scalar/vector/widened def - // of the respective phi node. Any other casts in the induction def-use chain - // have no other uses outside the phi update chain, and will be ignored. - const InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl &Casts = IndDes.getCastInsts(); - DeadInstructions.insert(Casts.begin(), Casts.end()); } } @@ -8591,9 +8512,7 @@ if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { assert(II->getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - const SmallVectorImpl &Casts = II->getCastInsts(); - return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], *II, Casts.empty() ? nullptr : Casts.front()); + return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); } return nullptr; @@ -9233,6 +9152,8 @@ cast(Plan->getEntry())->setExit(VPBB); + VPlanTransforms::removeRedundantInductionCasts(*Plan); + // Now that sink-after is done, move induction recipes for optimized truncates // to the phi section of the header block. for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) @@ -9713,9 +9634,9 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction( - IV, getInductionDescriptor(), getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), getCastValue(), State); + State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), + getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1004,29 +1004,21 @@ /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. -class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; const InductionDescriptor &IndDesc; public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, - const InductionDescriptor &IndDesc, - Instruction *Cast = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV), - IndDesc(IndDesc) { - new VPValue(IV, this); - - if (Cast) - new VPValue(Cast, this); - } + const InductionDescriptor &IndDesc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), + IV(IV), IndDesc(IndDesc) {} VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, const InductionDescriptor &IndDesc, TruncInst *Trunc) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV), - IndDesc(IndDesc) { - new VPValue(Trunc, this); - } + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), + IV(IV), IndDesc(IndDesc) {} ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1048,13 +1040,6 @@ /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } - /// Returns the cast VPValue, if one is attached, or nullptr otherwise. - VPValue *getCastValue() { - if (getNumDefinedValues() != 2) - return nullptr; - return getVPValue(1); - } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -37,6 +37,14 @@ static bool sinkScalarOperands(VPlan &Plan); static bool mergeReplicateRegions(VPlan &Plan); + + /// Remove redundant casts of inductions. + /// + /// Such redundant casts are casts of induction variables that can be ignored, + /// because we already proved that the casted phi is equal to the uncasted phi + /// in the vectorized loop. There is no need to vectorize the cast - the same + /// value can be used for both the phi and casts in the vector loop. + static void removeRedundantInductionCasts(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -294,3 +294,39 @@ delete ToDelete; return Changed; } + +void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { + SmallVector> CastsToRemove; + for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + auto *IV = dyn_cast(&Phi); + if (!IV) + continue; + auto &Casts = IV->getInductionDescriptor().getCastInsts(); + if (Casts.empty() || IV->getTruncInst()) + continue; + + // Visit all casts connected to IV and in Casts. Collect them. + // remember them for removal. + VPValue *FindMyCast = IV; + for (unsigned NumCastsToRemove = Casts.size(); NumCastsToRemove > 0; + NumCastsToRemove--) { + VPRecipeBase *FoundUserCast = nullptr; + for (auto *U : FindMyCast->users()) { + auto *UserCast = cast(U); + if (UserCast->getNumDefinedValues() == 1 && + UserCast->getVPSingleValue()->getUnderlyingValue() == + Casts[NumCastsToRemove - 1]) { + FoundUserCast = UserCast; + break; + } + } + assert(FoundUserCast && "Missing a cast to remove"); + CastsToRemove.emplace_back(FoundUserCast, IV); + FindMyCast = FoundUserCast->getVPSingleValue(); + } + } + for (auto &E : CastsToRemove) { + E.first->getVPSingleValue()->replaceAllUsesWith(E.second); + E.first->eraseFromParent(); + } +} diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -933,3 +933,41 @@ exit: ret void } + +; Test case where %iv.2.ext and %iv.2.conv become redundant due to the SCEV +; predicates generated for the vector loop. They should be removed in the +; vector loop. +define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n, i32 %step, i32* %ptr) { +; CHECK-LABEL: @test_optimized_cast_induction_feeding_first_order_recurrence( +; CHECK-LABEL: vector.body: +; CHECK-NEXT: [[MAIN_IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[MAIN_IV_NEXT:%.+]], %vector.body ] +; CHECK-NEXT: [[VEC_RECUR:%.+]] = phi <2 x i32> [ , %vector.ph ], [ [[VEC_IV:%.+]], %vector.body ] +; CHECK-NEXT: [[VEC_IV]] = phi <2 x i32> [ %induction, %vector.ph ], [ [[VEC_IV_NEXT:%.+]], %vector.body ] +; CHECK-NEXT: [[MAIN_IV_0:%.+]] = add i64 [[MAIN_IV]], 0 +; CHECK-NEXT: [[RECUR_SHUFFLE:%.+]] = shufflevector <2 x i32> [[VEC_RECUR]], <2 x i32> [[VEC_IV]], <2 x i32> +; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* %ptr, i64 [[MAIN_IV_0]] +; CHECK-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0 +; CHECK-NEXT: [[GEP_CAST:%.+]] = bitcast i32* [[GEP1]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[RECUR_SHUFFLE]], <2 x i32>* [[GEP_CAST]], align 4 +; CHECK-NEXT: [[MAIN_IV_NEXT]] = add nuw i64 [[MAIN_IV]], 2 +; CHECK-NEXT: [[VEC_IV_NEXT]] = add <2 x i32> [[VEC_IV]], +; +entry: + br label %loop + +loop: + %for = phi i32 [ 0, %entry ], [ %iv.2.conv, %loop ] + %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ] + %iv.2.ext = shl i32 %iv.2, 24 + %iv.2.conv = ashr exact i32 %iv.2.ext, 24 + %gep = getelementptr inbounds i32, i32* %ptr, i64 %iv.1 + store i32 %for, i32* %gep, align 4 + %iv.2.next = add nsw i32 %iv.2.conv, %step + %iv.1.next = add nuw nsw i64 %iv.1, 1 + %exitcond = icmp eq i64 %iv.1.next, %n + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +}