Index: include/llvm/Analysis/ScalarEvolution.h =================================================================== --- include/llvm/Analysis/ScalarEvolution.h +++ include/llvm/Analysis/ScalarEvolution.h @@ -1012,6 +1012,11 @@ const SCEV *S, const Loop *L, SmallPtrSetImpl &Preds); + /// Check if the Phi Node associated with \p PhiScev has an entry in + /// PredicatedSCEVRewrites which maps it to an \p AR under some predicates. + bool havePredicatedSCEVRewriteForPHI(const SCEVUnknown *PhiScev, + const SCEVAddRecExpr *AR); + private: /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a /// Value is deleted. @@ -1891,6 +1896,11 @@ /// The printed text is indented by \p Depth. void print(raw_ostream &OS, unsigned Depth) const; + /// Check if \p AR1 and \p AR2 are equal, while taking into account + /// Equal predicates in Preds. + bool areAddRecsEqualWithPreds(const SCEVAddRecExpr *AR1, + const SCEVAddRecExpr *AR2) const; + private: /// Increments the version number of the predicate. This needs to be called /// every time the SCEV predicate changes. Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -306,10 +306,13 @@ /// induction, the induction descriptor \p D will contain the data describing /// this induction. If by some other means the caller has a better SCEV /// expression for \p Phi than the one returned by the ScalarEvolution - /// analysis, it can be passed through \p Expr. - static bool isInductionPHI(PHINode *Phi, const Loop* L, ScalarEvolution *SE, - InductionDescriptor &D, - const SCEV *Expr = nullptr); + /// analysis, it can be passed through \p Expr. If the def-use chain + /// associated with the phi includes casts (that we know we can ignore + /// under proper runtime checks), they are passed through \p CastsToIgnore. + static bool + isInductionPHI(PHINode *Phi, const Loop* L, ScalarEvolution *SE, + InductionDescriptor &D, const SCEV *Expr = nullptr, + SmallVectorImpl *CastsToIgnore = nullptr); /// Returns true if \p Phi is a floating point induction in the loop \p L. /// If \p Phi is an induction, the induction descriptor \p D will contain @@ -348,10 +351,18 @@ Instruction::BinaryOpsEnd; } + /// Returns a reference to the type cast instructions in the induction + /// update chain, that are redundant when guarded with a runtime + /// SCEV overflow check. + const SmallVectorImpl &getCastInsts() const { + return RedundantCasts; + } + private: /// Private constructor - used by \c isInductionPHI. InductionDescriptor(Value *Start, InductionKind K, const SCEV *Step, - BinaryOperator *InductionBinOp = nullptr); + BinaryOperator *InductionBinOp = nullptr, + SmallVectorImpl *Casts = nullptr); /// Start value. TrackingVH StartValue; @@ -361,6 +372,9 @@ const SCEV *Step = nullptr; // Instruction that advances induction variable. BinaryOperator *InductionBinOp = nullptr; + // Instructions used for type-casts of the induction variable, + // that are redundant when guarded with a runtime SCEV overflow check. + SmallVector RedundantCasts; }; BasicBlock *InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -4421,6 +4421,25 @@ return L; } +bool ScalarEvolution::havePredicatedSCEVRewriteForPHI( + const SCEVUnknown *PhiScev, const SCEVAddRecExpr *AR) { + + auto *PN = cast(PhiScev->getValue()); + const Loop *L = isIntegerLoopHeaderPHI(PN, LI); + if (!L) + return false; + + auto I = PredicatedSCEVRewrites.find({PhiScev, L}); + if (I == PredicatedSCEVRewrites.end()) + return false; + std::pair> Rewrite = + I->second; + assert(isa(Rewrite.first) && "Expected an AddRec"); + if (Rewrite.first != AR) + return false; + return true; +} + // Analyze \p SymbolicPHI, a SCEV expression of a phi node, and check if the // computation that updates the phi follows the following pattern: // (SExt/ZExt ix (Trunc iy (%SymbolicPHI) to ix) to iy) + InvariantAccum @@ -4624,11 +4643,11 @@ // Construct the extended SCEV: (Ext ix (Trunc iy (Expr) to ix) to iy) // for each of StartVal and Accum - auto GetExtendedExpr = [&](const SCEV *Expr) -> const SCEV * { + auto GetExtendedExpr = [&](const SCEV *Expr, bool SignedEx) -> const SCEV * { assert(isLoopInvariant(Expr, L) && "Expr is expected to be invariant"); const SCEV *TruncatedExpr = getTruncateExpr(Expr, TruncTy); const SCEV *ExtendedExpr = - Signed ? getSignExtendExpr(TruncatedExpr, Expr->getType()) + SignedEx ? getSignExtendExpr(TruncatedExpr, Expr->getType()) : getZeroExtendExpr(TruncatedExpr, Expr->getType()); return ExtendedExpr; }; @@ -4644,13 +4663,15 @@ isKnownPredicate(ICmpInst::ICMP_NE, Expr, ExtendedExpr); }; - const SCEV *StartExtended = GetExtendedExpr(StartVal); + const SCEV *StartExtended = GetExtendedExpr(StartVal, Signed); if (PredIsKnownFalse(StartVal, StartExtended)) { DEBUG(dbgs() << "P2 is compile-time false\n";); return None; } - const SCEV *AccumExtended = GetExtendedExpr(Accum); + // The Step us always Signed (because the overflow checks are either + // NSSW or NUSW) + const SCEV *AccumExtended = GetExtendedExpr(Accum, /*Signed=*/true); if (PredIsKnownFalse(Accum, AccumExtended)) { DEBUG(dbgs() << "P3 is compile-time false\n";); return None; @@ -4717,6 +4738,26 @@ return Rewrite; } +bool PredicatedScalarEvolution::areAddRecsEqualWithPreds ( + const SCEVAddRecExpr *AR1, const SCEVAddRecExpr *AR2) const { + if (AR1 == AR2) + return true; + + auto areExprsEqual = [&](const SCEV *Expr1, const SCEV *Expr2) -> bool { + if (Expr1 != Expr2 && + !SE.isKnownPredicate(ICmpInst::ICMP_EQ, Expr1, Expr2) && + !Preds.implies(SE.getEqualPredicate(Expr1, Expr2)) && + !Preds.implies(SE.getEqualPredicate(Expr2, Expr1))) + return false; + return true; + }; + + if (!areExprsEqual(AR1->getStart(), AR2->getStart()) || + !areExprsEqual(AR1->getStepRecurrence(SE), AR2->getStepRecurrence(SE))) + return false; + return true; +} + /// A helper function for createAddRecFromPHI to handle simple cases. /// /// This function tries to find an AddRec expression for the simplest (yet most @@ -4859,33 +4900,33 @@ // indices form a positive value. if (GEP->isInBounds() && GEP->getOperand(0) == PN) { Flags = setFlags(Flags, SCEV::FlagNW); - + const SCEV *Ptr = getSCEV(GEP->getPointerOperand()); if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr))) Flags = setFlags(Flags, SCEV::FlagNUW); } - + // We cannot transfer nuw and nsw flags from subtraction // operations -- sub nuw X, Y is not the same as add nuw X, -Y // for instance. } - + const SCEV *StartVal = getSCEV(StartValueV); const SCEV *PHISCEV = getAddRecExpr(StartVal, Accum, L, Flags); - + // Okay, for the entire analysis of this edge we assumed the PHI // to be symbolic. We now need to go back and purge all of the // entries for the scalars that use the symbolic expression. forgetSymbolicName(PN, SymbolicName); ValueExprMap[SCEVCallbackVH(PN, this)] = PHISCEV; - + // We can add Flags to the post-inc expression only if we // know that it is *undefined behavior* for BEValueV to // overflow. if (auto *BEInst = dyn_cast(BEValueV)) if (isLoopInvariant(Accum, L) && isAddRecNeverPoison(BEInst, L)) (void)getAddRecExpr(getAddExpr(StartVal, Accum), Accum, L, Flags); - + return PHISCEV; } } Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -678,7 +678,8 @@ } InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, - const SCEV *Step, BinaryOperator *BOp) + const SCEV *Step, BinaryOperator *BOp, + SmallVectorImpl *Casts) : StartValue(Start), IK(K), Step(Step), InductionBinOp(BOp) { assert(IK != IK_NoInduction && "Not an induction"); @@ -705,6 +706,12 @@ (InductionBinOp->getOpcode() == Instruction::FAdd || InductionBinOp->getOpcode() == Instruction::FSub))) && "Binary opcode should be specified for FP induction"); + + if (Casts) { + for (auto &Inst : *Casts) { + RedundantCasts.push_back(Inst); + } + } } int InductionDescriptor::getConsecutiveDirection() const { @@ -808,7 +815,7 @@ StartValue = Phi->getIncomingValue(1); } else { assert(TheLoop->contains(Phi->getIncomingBlock(1)) && - "Unexpected Phi node in the loop"); + "Unexpected Phi node in the loop"); BEValue = Phi->getIncomingValue(1); StartValue = Phi->getIncomingValue(0); } @@ -841,6 +848,117 @@ return true; } +/// If PredicatedSCEVRewrites contains an entry that maps \p PhiScev to \p AR +/// under a runtime predicate P, then we know the following: +/// (1) The value of the phi at iteration i is some scev expression with casts. +/// (2) Under the runtime predicate P, the above expression is equal to a simple +/// AddRec expression without casts, namely: Start + i*Step, where Start +/// and Step are loop invariant. +/// +/// We want to find the cast instructions that are involved in the +/// update-chain of this induction. A caller that adds the required runtime +/// predicate can be free to drop these cast instructions, and compute +/// the phi using (2) above instead of (1). +/// +/// For example, (1) can take the following form: +/// (Ext ix (Trunc iy ( Start + i*Step ) to ix) to iy) +/// (this is the scev pattern that createAddRecFromPHIWithCasts() currently +/// supports). It corresponds to the following IR sequence: +/// %for.body: +/// %x = phi i64 [ 0, %ph ], [ %add, %for.body ] +/// %casted_phi = "ExtTrunc i64 %x" +/// %add = add i64 %casted_phi, %step +/// +/// where %x is given in \p PN, +/// PSE.getSCEV(%x) is equal to PSE.getSCEV(%casted_phi) under a predicate, +/// and the IR sequence that "ExtTrunc i64 %x" represents can take one of +/// several forms, for example, such as: +/// ExtTrunc1: %casted_phi = and %x, 2^n-1 +/// or: +/// ExtTrunc2: %t = shl %x, m +/// %casted_phi = ashr %t, m +/// +/// If we are able to find such sequence, we return the instructions +/// we found, namely %casted_phi and the instructions on its use-def chain up +/// to the phi (not including the phi). +bool getCastsForInductionPHI( + PredicatedScalarEvolution &PSE, const SCEVUnknown *PhiScev, + const SCEVAddRecExpr *AR, SmallVectorImpl &CastInsts) { + + ScalarEvolution *SE = PSE.getSE(); + if (!SE->havePredicatedSCEVRewriteForPHI(PhiScev, AR)) + return false; + + // PhiScev was found in PredicatedSCEVRewrites, and it is mapped to \p AR + // under some runtime tests. Find the cast instructions that participate in + // the def-use chain of PhiScev in the loop. + + assert(CastInsts.empty() && "CastInsts is expected to be empty."); + auto *PN = cast(PhiScev->getValue()); + assert(PSE.getSCEV(PN) == AR && "Unexpected phi node SCEV expression"); + const Loop *L = AR->getLoop(); + + // FORNOW/TODO: We currently expect the def-use chain to include only + // two-operand instructions, where one of the operands is an invariant. + // createAddRecFromPHIWithCasts() currently does not support anything more + // involved than that, so we keep the search simple. This can be + // extended/generalized as needed. + + auto getDef = [&](const Value *Val) -> Value * { + const BinaryOperator *BinOp = dyn_cast(Val); + if (!BinOp) + return nullptr; + Value *Op0 = BinOp->getOperand(0); + Value *Op1 = BinOp->getOperand(1); + Value *Def = nullptr; + if (L->isLoopInvariant(Op0)) + Def = Op1; + else if (L->isLoopInvariant(Op1)) + Def = Op0; + return Def; + }; + + // Look for the instruction that defines the induction via the + // loop backedge. + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return false; + Value *Val = PN->getIncomingValueForBlock(Latch); + if (!Val) + return false; + + // Follow the def-use chain until the induction phi is reached. + // If a on the way we encounter a Value that has the same SCEV Expr as the + // phi node, we can consider the instructions we visit from that point + // as part of the cast-sequence that can be ignored. + bool InCastSequence = false; + auto *Inst = dyn_cast(Val); + while (Val != PN) { + // If we encountered a phi node other than PN, or if we left the loop, + // we bail out. + if (!Inst || !L->contains(Inst)) { + return false; + } + auto *AddRec = dyn_cast(PSE.getSCEV(Val)); + if (AddRec && PSE.areAddRecsEqualWithPreds(AddRec,AR)) { + InCastSequence = true; + } + if (InCastSequence) { + // Only the last instruction in the cast sequence is expected to have + // uses outside the induction def-use chain. + if (!CastInsts.empty()) + assert(Inst->hasOneUse() && "Cast used outside the induction"); + CastInsts.push_back(Inst); + } + Val = getDef(Val); + if (!Val) + return false; + Inst = dyn_cast(Val); + } + + return InCastSequence; +} + bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, PredicatedScalarEvolution &PSE, InductionDescriptor &D, @@ -870,13 +988,26 @@ return false; } + // Record any Cast instructions that participate in the induction update + const auto *SymbolicPhi = dyn_cast(PhiScev); + // If we started from an UnknownSCEV, and managed to build an addRecurrence + // only after enabling Assume with PSCEV, this means we may have encountered + // cast instructions that required adding a runtime check in order to + // guarantee the correctness of the AddRecurence respresentation of the + // induction. + if (PhiScev != AR && SymbolicPhi) { + SmallVector Casts; + if (getCastsForInductionPHI(PSE, SymbolicPhi, AR, Casts)) + return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR, &Casts); + } + return isInductionPHI(Phi, TheLoop, PSE.getSE(), D, AR); } -bool InductionDescriptor::isInductionPHI(PHINode *Phi, const Loop *TheLoop, - ScalarEvolution *SE, - InductionDescriptor &D, - const SCEV *Expr) { +bool InductionDescriptor::isInductionPHI( + PHINode *Phi, const Loop *TheLoop, ScalarEvolution *SE, + InductionDescriptor &D, const SCEV *Expr, + SmallVectorImpl *CastsToIgnore) { Type *PhiTy = Phi->getType(); // We only handle integer and pointer inductions variables. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) @@ -895,7 +1026,7 @@ // FIXME: We should treat this as a uniform. Unfortunately, we // don't currently know how to handled uniform PHIs. DEBUG(dbgs() << "LV: PHI is a recurrence with respect to an outer loop.\n"); - return false; + return false; } Value *StartValue = @@ -908,7 +1039,8 @@ return false; if (PhiTy->isIntegerTy()) { - D = InductionDescriptor(StartValue, IK_IntInduction, Step); + D = InductionDescriptor(StartValue, IK_IntInduction, Step, /*BOp=*/ nullptr, + CastsToIgnore); return true; } Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -611,6 +611,20 @@ /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; + /// If there is a cast involved in the induction variable \p ID, which should + /// be ignored in the vectorized loop body, this function records the + /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the + /// cast. We had already proved that the casted Phi is equal to the uncasted + /// Phi in the vectorized loop (under a runtime guard), and therefore + /// there is no need to vectorize the cast - the same value can be used in the + /// vector loop for both the Phi and the cast. + /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, + /// Otherwise, \p VectorLoopValue is a widened/vectorized value. + void recordVectorLoopValueForInductionCast (const InductionDescriptor &ID, + Value *VectorLoopValue, + unsigned Part, + unsigned Lane = UINT_MAX); + /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -1572,7 +1586,17 @@ /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } - /// Returns True if V is an induction variable in this loop. + /// Returns True if V is a Phi node of an induction variable in this loop. + bool isInductionPhi(const Value *V); + + /// Returns True if V is a cast that is part of an induction def-use chain, + /// and had been proven to be redundant under a runtime guard (in other + /// words, the cast has the same SCEV expression as the induction phi). + bool isCastedInductionVariable(const Value *V); + + /// Returns True if V can be considered as an induction variable in this + /// loop. V can be the induction phi, or some redundant cast in the def-use + /// chain of the inducion phi. bool isInductionVariable(const Value *V); /// Returns True if PN is a reduction variable in this loop. @@ -1783,6 +1807,12 @@ /// variables can be pointers. InductionList Inductions; + /// Holds all the casts that participate in the update chain of the induction + /// variables, and that have been proven to be redundant (possibly under a + /// runtime guard). These casts can be ignored when creating the vectorized + /// loop body. + SmallPtrSet InductionCastsToIgnore; + /// Holds the phi nodes that are first-order recurrences. RecurrenceSet FirstOrderRecurrences; @@ -2016,7 +2046,7 @@ return false; // If the truncated value is not an induction variable, return false. - return Legal->isInductionVariable(Op); + return Legal->isInductionPhi(Op); } /// Collects the instructions to scalarize for each predicated instruction in @@ -2562,6 +2592,7 @@ Instruction *LastInduction = VecInd; for (unsigned Part = 0; Part < UF; ++Part) { VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction); + recordVectorLoopValueForInductionCast(II, LastInduction, Part); if (isa(EntryVal)) addMetadata(LastInduction, EntryVal); LastInduction = cast(addFastMathFlag( @@ -2595,6 +2626,22 @@ return llvm::any_of(IV->users(), isScalarInst); } +void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( + const InductionDescriptor &ID, Value *VectorLoopVal, unsigned Part, + unsigned Lane) { + const SmallVectorImpl &Casts = ID.getCastInsts(); + if (!Casts.empty()) { + // Only the first Cast instruction in the Casts vector is of interest. + // The rest of the Casts (if exist) have no uses outside the + // induction update chain itself. + Instruction *CastInst = *Casts.begin(); + if (Lane < UINT_MAX) + VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal); + else + VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal); + } +} + void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) { assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); @@ -2675,6 +2722,7 @@ Value *EntryPart = getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode()); VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart); + recordVectorLoopValueForInductionCast(ID, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); } @@ -2782,6 +2830,7 @@ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step)); auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul)); VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add); + recordVectorLoopValueForInductionCast(ID, Add, Part, Lane); } } } @@ -5122,6 +5171,15 @@ PHINode *Phi, const InductionDescriptor &ID, SmallPtrSetImpl &AllowedExit) { Inductions[Phi] = ID; + + // In case this induction also comes with casts that we know we can ignore + // in the vectorized loop body, record them here. All casts could be recorded + // here for ignoring, but suffices to record only the first (as it is the + // only one that may bw used outside the cast sequence). + const SmallVectorImpl &Casts = ID.getCastInsts(); + if (!Casts.empty()) + InductionCastsToIgnore.insert(*Casts.begin()); + Type *PhiTy = Phi->getType(); const DataLayout &DL = Phi->getModule()->getDataLayout(); @@ -5751,7 +5809,7 @@ return true; } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) { Value *In0 = const_cast(V); PHINode *PN = dyn_cast_or_null(In0); if (!PN) @@ -5760,6 +5818,18 @@ return Inductions.count(PN); } +bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { + auto *Inst = dyn_cast(V); + if (!Inst) + return false; + + return InductionCastsToIgnore.count(Inst); +} + +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + return isInductionPhi(V) || isCastedInductionVariable(V); +} + bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { return FirstOrderRecurrences.count(Phi); } @@ -6833,6 +6903,9 @@ if (ValuesToIgnore.count(&I)) continue; + if (VF > 1 && VecValuesToIgnore.count(&I)) + continue; + VectorizationCostTy C = getInstructionCost(&I, VF); // Check if we should override the cost. @@ -6869,14 +6942,16 @@ static const SCEV *getAddressAccessSCEV( Value *Ptr, LoopVectorizationLegality *Legal, - ScalarEvolution *SE, + PredicatedScalarEvolution &PSE, const Loop *TheLoop) { + auto *Gep = dyn_cast(Ptr); if (!Gep) return nullptr; // We are looking for a gep with all loop invariant indices except for one // which should be an induction variable. + auto SE = PSE.getSE(); unsigned NumOperands = Gep->getNumOperands(); for (unsigned i = 1; i < NumOperands; ++i) { Value *Opd = Gep->getOperand(i); @@ -6886,7 +6961,7 @@ } // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV. - return SE->getSCEV(Ptr); + return PSE.getSCEV(Ptr); } static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) { @@ -6906,7 +6981,7 @@ // Figure out whether the access is strided and get the stride value // if it's known in compile time - const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); + const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop); // Get the cost of the scalar memory instruction and address computation. unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV); @@ -7460,6 +7535,13 @@ SmallPtrSetImpl &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } + // Ignore type-casting instructions we identified during induction + // detection. + for (auto &Induction : *Legal->getInductionVars()) { + InductionDescriptor &IndDes = Induction.second; + const SmallVectorImpl &Casts = IndDes.getCastInsts(); + VecValuesToIgnore.insert(Casts.begin(), Casts.end()); + } } LoopVectorizationCostModel::VectorizationFactor @@ -7562,6 +7644,18 @@ return U == Ind || DeadInstructions.count(cast(U)); })) DeadInstructions.insert(IndUpdate); + + // We record as "Dead" also the type-casting instructions we had identified + // during induction analysis. We don't need any handling for them in the + // vectorized loop because we have proven that, under a proper runtime + // test guarding the vectorized loop, the value of the phi, and the casted + // value of the phi, are the same. The last instruction in this casting chain + // will get its scalar/vector/widened def from the scalar/vector/widened def + // of the respective phi node. Any other casts in the induction def-use chain + // have no other uses outside the phi update chain, and will be ignored. + InductionDescriptor &IndDes = Induction.second; + const SmallVectorImpl &Casts = IndDes.getCastInsts(); + DeadInstructions.insert(Casts.begin(), Casts.end()); } } Index: test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/vect-phiscev-sext-trunc.ll @@ -0,0 +1,211 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s -check-prefix=VF8 +; RUN: opt -S -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix=VF1 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Given a loop with an induction variable which is being +; truncated/extended using casts that had been proven to +; be redundant under a runtime test, we want to make sure +; that these casts, do not get vectorized/scalarized/widened. +; This is the case for inductions whose SCEV expression is +; of the form "ExtTrunc(%phi) + %step", where "ExtTrunc" +; can be a result of the IR sequences we check below. +; +; See also pr30654. +; + +; Case1: Check the following induction pattern: +; +; %p.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] +; %sext = shl i32 %p.09, 24 +; %conv = ashr exact i32 %sext, 24 +; %add = add nsw i32 %conv, %step +; +; This is the case in the following code: +; +; void doit1(int n, int step) { +; int i; +; char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; +; The "ExtTrunc" IR sequence here is: +; "%sext = shl i32 %p.09, 24" +; "%conv = ashr exact i32 %sext, 24" +; We check that it does not appear in the vector loop body, whether +; we vectorize or scalarize the induction. +; In the case of widened induction, this means that the induction phi +; is directly used, without shl/ashr on the way. + +; VF8-LABEL: @doit1 +; VF8: vector.body: +; VF8: %vec.ind = phi <8 x i32> +; VF8: store <8 x i32> %vec.ind +; VF8: middle.block: + +; VF1-LABEL: @doit1 +; VF1: vector.body: +; VF1-NOT: %{{.*}} = shl i32 +; VF1: middle.block: + +@a = common local_unnamed_addr global [250 x i32] zeroinitializer, align 16 + +define void @doit1(i32 %n, i32 %step) { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %p.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] + %sext = shl i32 %p.09, 24 + %conv = ashr exact i32 %sext, 24 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + +; Case2: Another variant of the above pattern is where the induction variable +; is used only for address compuation (i.e. it is a GEP index) and therefore +; the induction is not vectorized but rather only the step is widened. +; +; This is the case in the following code, where the induction variable 'w_ix' +; is only used to access the array 'in': +; +; void doit2(int *in, int *out, size_t size, size_t step) +; { +; int w_ix = 0; +; for (size_t offset = 0; offset < size; ++offset) +; { +; int w = in[w_ix]; +; out[offset] = w; +; w_ix += step; +; } +; } +; +; The "ExtTrunc" IR sequence here is similar to the previous case: +; "%sext = shl i64 %w_ix.012, 32 +; %idxprom = ashr exact i64 %sext, 32" +; We check that it does not appear in the vector loop body, whether +; we widen or scalarize the induction. +; In the case of widened induction, this means that the induction phi +; is directly used, without shl/ashr on the way. + +; VF8-LABEL: @doit2 +; VF8: vector.body: +; VF8: %vec.ind = phi <8 x i64> +; VF8: %{{.*}} = extractelement <8 x i64> %vec.ind +; VF8: middle.block: + +; VF1-LABEL: @doit2 +; VF1: vector.body: +; VF1-NOT: %{{.*}} = shl i64 +; VF1: middle.block: +; + +define void @doit2(i32* nocapture readonly %in, i32* nocapture %out, i64 %size, i64 %step) { +entry: + %cmp9 = icmp eq i64 %size, 0 + br i1 %cmp9, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + ret void + +for.body: + %w_ix.011 = phi i64 [ 0, %for.body.lr.ph ], [ %add, %for.body ] + %offset.010 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %sext = shl i64 %w_ix.011, 32 + %idxprom = ashr exact i64 %sext, 32 + %arrayidx = getelementptr inbounds i32, i32* %in, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %out, i64 %offset.010 + store i32 %0, i32* %arrayidx1, align 4 + %add = add i64 %idxprom, %step + %inc = add nuw i64 %offset.010, 1 + %exitcond = icmp eq i64 %inc, %size + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; Case3: Lastly, check also the following induction pattern: +; +; %p.09 = phi i32 [ %val0, %scalar.ph ], [ %add, %for.body ] +; %conv = and i32 %p.09, 255 +; %add = add nsw i32 %conv, %step +; +; This is the case in the following code: +; +; int a[N]; +; void doit3(int n, int step) { +; int i; +; unsigned char p = 0; +; for (i = 0; i < n; i++) { +; a[i] = p; +; p = p + step; +; } +; } +; +; The "ExtTrunc" IR sequence here is: +; "%conv = and i32 %p.09, 255". +; We check that it does not appear in the vector loop body, whether +; we vectorize or scalarize the induction. + +; VF8-LABEL: @doit3 +; VF8: vector.body: +; VF8: %vec.ind = phi <8 x i32> +; VF8: store <8 x i32> %vec.ind +; VF8: middle.block: + +; VF1-LABEL: @doit3 +; VF1: vector.body: +; VF1-NOT: %{{.*}} = and i32 +; VF1: middle.block: + +define void @doit3(i32 %n, i32 %step) { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %p.09 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] + %conv = and i32 %p.09, 255 + %arrayidx = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %conv, i32* %arrayidx, align 4 + %add = add nsw i32 %conv, %step + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: + br label %for.end + +for.end: + ret void +}