diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -546,14 +546,13 @@ PrimaryInduction = Phi; } - // Both the PHI node itself, and the "post-increment" value feeding - // back into the PHI node may have external users. - // We can allow those uses, except if the SCEVs we have for them rely - // on predicates that only hold within the loop, since allowing the exit - // currently means re-using this SCEV outside the loop (see PR33706 for more - // details). + // The induction PHI node may have external users. + AllowedExit.insert(Phi); + + // Allow external uses of the "post-increment" value feeding back into + // the PHI node unless its SCEV relies on predicates that may not hold + // outside of the loop (see PR33706 for more details). if (PSE.getUnionPredicate().isAlwaysTrue()) { - AllowedExit.insert(Phi); AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -568,7 +568,7 @@ /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, + Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock); /// Create a new induction variable inside L. @@ -3536,9 +3536,9 @@ Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = CastInst::getCastOpcode(VectorTripCount, true, StepType, true); - Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); + Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValue = emitTransformedIndex(B, VTC, PSE.getSE(), DL, II); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). @@ -3546,10 +3546,10 @@ B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt())); CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true, StepType, true); - CRD = - B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); + VTC = + B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, VTC, PSE.getSE(), DL, II); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3717,13 +3717,13 @@ // times the unroll factor (num of SIMD instructions). Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt()); Value *Step = createStepForVF(Builder, IdxTy, VF, UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Value *VectorTripCount = getOrCreateVectorTripCount(Lp); Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + createInductionVariable(Lp, StartIdx, VectorTripCount, Step, getDebugLocFromInstOrOperands(OldInduction)); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(Lp, CountRoundDown); + createInductionResumeValues(Lp, VectorTripCount); return completeLoopSkeleton(Lp, OrigLoopID); } @@ -3734,7 +3734,7 @@ // value for the IV when arriving directly from the middle block. void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, - Value *CountRoundDown, Value *EndValue, + Value *VectorTripCount, Value *EndValue, BasicBlock *MiddleBlock) { // There are two kinds of external IV usages - those that use the value // computed in the last iteration (the PHI) and those that use the penultimate @@ -3750,40 +3750,43 @@ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) { Instruction *UI = cast(U); - if (!OrigLoop->contains(UI)) { - assert(isa(UI) && "Expected LCSSA form"); - MissingVals[UI] = EndValue; - } + if (OrigLoop->contains(UI)) + continue; + + assert(isa(UI) && "Expected LCSSA form"); + assert(!Cost->foldTailByMasking() && "Unexpected external use of IV"); + MissingVals[UI] = EndValue; } // An external user of the penultimate value need to see EndValue - Step. // The simplest way to get this is to recompute it from the constituent SCEVs, - // that is Start + (Step * (CRD - 1)). + // that is Start + (Step * (VTC - 1)). for (User *U : OrigPhi->users()) { auto *UI = cast(U); - if (!OrigLoop->contains(UI)) { - const DataLayout &DL = - OrigLoop->getHeader()->getModule()->getDataLayout(); - assert(isa(UI) && "Expected LCSSA form"); + if (OrigLoop->contains(UI)) + continue; - IRBuilder<> B(MiddleBlock->getTerminator()); + const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + assert(isa(UI) && "Expected LCSSA form"); + assert(!Cost->foldTailByMasking() && "Unexpected external use of IV"); - // Fast-math-flags propagate from the original induction instruction. - if (II.getInductionBinOp() && isa(II.getInductionBinOp())) - B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + IRBuilder<> B(MiddleBlock->getTerminator()); - Value *CountMinusOne = B.CreateSub( - CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1)); - Value *CMO = - !II.getStep()->getType()->isIntegerTy() - ? B.CreateCast(Instruction::SIToFP, CountMinusOne, - II.getStep()->getType()) - : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); - CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); - Escape->setName("ind.escape"); - MissingVals[UI] = Escape; - } + // Fast-math-flags propagate from the original induction instruction. + if (II.getInductionBinOp() && isa(II.getInductionBinOp())) + B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags()); + + Value *CountMinusOne = B.CreateSub( + VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1)); + Value *CMO = + !II.getStep()->getType()->isIntegerTy() + ? B.CreateCast(Instruction::SIToFP, CountMinusOne, + II.getStep()->getType()) + : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); + CMO->setName("cast.cmo"); + Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + Escape->setName("ind.escape"); + MissingVals[UI] = Escape; } for (auto &I : MissingVals) { @@ -8173,10 +8176,10 @@ IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt()); Value *Step = getRuntimeVF(B, IdxTy, VF * UF); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); - EPI.VectorTripCount = CountRoundDown; + Value *VectorTripCount = getOrCreateVectorTripCount(Lp); + EPI.VectorTripCount = VectorTripCount; Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + createInductionVariable(Lp, StartIdx, VectorTripCount, Step, getDebugLocFromInstOrOperands(OldInduction)); // Skip induction resume value creation here because they will be created in @@ -8333,11 +8336,11 @@ // Generate the induction variable. OldInduction = Legal->getPrimaryInduction(); - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); + Value *VectorTripCount = getOrCreateVectorTripCount(Lp); Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF); Value *StartIdx = EPResumeVal; Induction = - createInductionVariable(Lp, StartIdx, CountRoundDown, Step, + createInductionVariable(Lp, StartIdx, VectorTripCount, Step, getDebugLocFromInstOrOperands(OldInduction)); // Generate induction resume values. These variables save the new starting @@ -8347,7 +8350,7 @@ // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues(Lp, CountRoundDown, + createInductionResumeValues(Lp, VectorTripCount, {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); diff --git a/llvm/test/Transforms/LoopVectorize/pr52335.ll b/llvm/test/Transforms/LoopVectorize/pr52335.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/pr52335.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s + +; Generated from: +; +; #include +; #include +; +; int test(int* arr, size_t n) { +; int value = 0; +; #pragma clang loop vectorize_predicate(enable) +; for (uint8_t i = 1; i < n; ++i) { +; arr[i - 1] = 0x41; +; value = arr[i]; +; } +; return value; +; } +; + +target triple = "x86_64-pc-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local i32 @test(i32* nocapture %arr, i64 %n) local_unnamed_addr #0 { +; The vectorizor should refuse to fold the tail by masking because +; %conv12 is used outside of the loop. Test for this by checking that +; %n.vec, the vector trip count, is rounded down to the next multiple of +; 4. If folding the tail, it would have been rounded up instead. +; +; CHECK-LABEL: vector.ph: +; CHECK-NEXT: %n.mod.vf = urem i64 %0, 4 +; CHECK-NEXT: %n.vec = sub i64 %0, %n.mod.vf +entry: + %cmp10 = icmp ugt i64 %n, 1 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: + %conv12.lcssa = phi i64 [ %conv12, %for.body ] + %arrayidx4.le = getelementptr inbounds i32, i32* %arr, i64 %conv12.lcssa + %0 = load i32, i32* %arrayidx4.le, align 4 + br label %for.cond.cleanup + +for.cond.cleanup: + %value.0.lcssa = phi i32 [ %0, %for.cond.for.cond.cleanup_crit_edge ], [ 0, %entry ] + ret i32 %value.0.lcssa + +for.body: + %conv12 = phi i64 [ %conv, %for.body ], [ 1, %for.body.preheader ] + %i.011 = phi i8 [ %inc, %for.body ], [ 1, %for.body.preheader ] + %sub = add nsw i64 %conv12, -1 + %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %sub + store i32 65, i32* %arrayidx, align 4 + %inc = add i8 %i.011, 1 + %conv = zext i8 %inc to i64 + %cmp = icmp ult i64 %conv, %n + br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge, !llvm.loop !0 +} + +!0 = distinct !{!0, !1, !2, !3} +!1 = !{!"llvm.loop.unroll.disable"} +!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!3 = !{!"llvm.loop.vectorize.enable", i1 true}