Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -355,6 +355,12 @@ /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(); + + /// Set up the values of the IVs correctly when exiting the vector loop. + void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, + Value *CountRoundDown, Value *EndValue, + BasicBlock *MiddleBlock); + /// Create a new induction variable inside L. PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL); @@ -1493,7 +1499,7 @@ /// Holds the widest induction type encountered. Type *WidestIndTy; - /// Allowed outside users. This holds the reduction + /// Allowed outside users. This holds the induction and reduction /// vars which can be accessed from outside the loop. SmallPtrSet AllowedExit; /// This set holds the variables which are known to be uniform after @@ -3218,6 +3224,9 @@ // or the value at the end of the vectorized loop. BCResumeVal->addIncoming(EndValue, MiddleBlock); + // Fix up external users of the induction variable. + fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock); + // Fix the scalar body counter (PHI node). unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); @@ -3257,6 +3266,54 @@ Hints.setAlreadyVectorized(); } +// Fix up external users of the induction variable. At this point, we are +// in LCSSA form, with all external PHIs that use the IV having one input value, +// coming from the remainder loop. We need those PHIs to also have a correct +// value for the IV when arriving directly from the middle block. 
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+                                       const InductionDescriptor &II,
+                                       Value *CountRoundDown, Value *EndValue,
+                                       BasicBlock *MiddleBlock) {
+  // There are two kinds of external IV usages - those that use the value
+  // computed in the last iteration (the value that feeds into the phi from the
+  // loop latch) and those that use the penultimate value (the PHI).
+  // We allow both, but they, obviously, have different values.
+
+  // External users of the last iteration's value should see the value that the
+  // remainder loop uses to initialize its own IV.
+  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+  for (User *U : PostInc->users()) {
+    Instruction *UI = cast<Instruction>(U);
+    if (!OrigLoop->contains(UI)) {
+      assert(isa<PHINode>(UI) && "Expected LCSSA form");
+      cast<PHINode>(UI)->addIncoming(EndValue, MiddleBlock);
+    }
+  }
+
+  // External users of the penultimate value need to see EndValue - Step.
+  // The simplest way to get this is to recompute it from the constituent SCEVs,
+  // that is Start + (Step * (CRD - 1)).
+  Value *PrevValue = nullptr;
+  for (User *U : OrigPhi->users()) {
+    Instruction *UI = cast<Instruction>(U);
+    if (!OrigLoop->contains(UI)) {
+      assert(isa<PHINode>(UI) && "Expected LCSSA form");
+      if (!PrevValue) {
+        const DataLayout &DL =
+            OrigLoop->getHeader()->getModule()->getDataLayout();
+        IRBuilder<> B(MiddleBlock->getTerminator());
+        Value *CountMinusOne = B.CreateSub(
+            CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+        Value *CMO = B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType(),
+                                         "cast.cmo");
+        PrevValue = II.transform(B, CMO, PSE.getSE(), DL);
+        PrevValue->setName("ind.escape");
+      }
+      cast<PHINode>(UI)->addIncoming(PrevValue, MiddleBlock);
+    }
+  }
+}
+
 namespace {
 struct CSEDenseMapInfo {
   static bool canHandle(Instruction *I) {
@@ -4637,10 +4694,10 @@
 /// \brief Check that the instruction has outside loop users and is not an
 /// identified reduction variable.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, - SmallPtrSetImpl &Reductions) { - // Reduction instructions are allowed to have exit users. All other - // instructions must not have external users. - if (!Reductions.count(Inst)) + SmallPtrSetImpl &AllowedExit) { + // Reduction and Induction instructions are allowed to have exit users. All + // other instructions must not have external users. + if (!AllowedExit.count(Inst)) // Check that all of the users of the loop are inside the BB. for (User *U : Inst->users()) { Instruction *UI = cast(U); @@ -4682,15 +4739,6 @@ DEBUG(dbgs() << "LV: Found an induction variable.\n"); - // Until we explicitly handle the case of an induction variable with - // an outside loop user we have to give up vectorizing this loop. - if (hasOutsideLoopUser(TheLoop, Phi, AllowedExit)) { - emitAnalysis(VectorizationReport(Phi) << - "use of induction value outside of the " - "loop is not handled by vectorizer"); - return false; - } - return true; } @@ -4757,6 +4805,11 @@ if (InductionDescriptor::isInductionPHI(Phi, PSE, ID)) { if (!addInductionPhi(Phi, ID)) return false; + // Both the PHI node itself, and the "post-increment" value feeding + // back into the PHI node may have external users. 
+ AllowedExit.insert(Phi); + AllowedExit.insert( + Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); continue; } @@ -4770,6 +4823,9 @@ if (InductionDescriptor::isInductionPHI(Phi, PSE, ID, true)) { if (!addInductionPhi(Phi, ID)) return false; + AllowedExit.insert(Phi); + AllowedExit.insert( + Phi->getIncomingValueForBlock(TheLoop->getLoopLatch())); continue; } Index: test/Transforms/LoopVectorize/iv_outside_user.ll =================================================================== --- test/Transforms/LoopVectorize/iv_outside_user.ll +++ test/Transforms/LoopVectorize/iv_outside_user.ll @@ -0,0 +1,86 @@ +; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s | FileCheck %s + +; CHECK-LABEL: @postinc +; CHECK-LABEL: scalar.ph: +; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ] +; CHECK-LABEL: for.end: +; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %n.vec, %middle.block ] +; CHECK: ret i32 %[[RET]] +define i32 @postinc(i32 %k) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %inc = add nsw i32 %inc.phi, 1 + %cmp = icmp eq i32 %inc, %k + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %inc +} + +; CHECK-LABEL: @preinc +; CHECK-LABEL: middle.block: +; CHECK: %[[CMO:.*]] = sub i32 %n.vec, 1 +; CHECK: %ind.escape = add i32 0, %[[CMO]] +; CHECK-LABEL: scalar.ph: +; CHECK: %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %entry ] +; CHECK-LABEL: for.end: +; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ] +; CHECK: ret i32 %[[RET]] +define i32 @preinc(i32 %k) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %inc = add nsw i32 %inc.phi, 1 + %cmp = icmp eq i32 %inc, %k + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %inc.phi +} + +; CHECK-LABEL: @constpre +; CHECK-LABEL: for.end: +; CHECK: %[[RET:.*]] = phi i32 [ {{.*}}, %for.body ], [ 2, 
%middle.block ] +; CHECK: ret i32 %[[RET]] +define i32 @constpre() { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 32, %entry ], [ %inc, %for.body ] + %inc = sub nsw i32 %inc.phi, 2 + %cmp = icmp eq i32 %inc, 0 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %inc.phi +} + +; CHECK-LABEL: @geppre +; CHECK-LABEL: middle.block: +; CHECK: %ind.escape = getelementptr i32, i32* %ptr, i64 124 +; CHECK-LABEL: for.end: +; CHECK: %[[RET:.*]] = phi i32* [ {{.*}}, %for.body ], [ %ind.escape, %middle.block ] +; CHECK: ret i32* %[[RET]] +define i32* @geppre(i32* %ptr) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %ptr.phi = phi i32* [ %ptr, %entry ], [ %inc.ptr, %for.body ] + %inc = add nsw i32 %inc.phi, 1 + %inc.ptr = getelementptr i32, i32* %ptr.phi, i32 4 + %cmp = icmp eq i32 %inc, 32 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32* %ptr.phi +} + + Index: test/Transforms/LoopVectorize/no_outside_user.ll =================================================================== --- test/Transforms/LoopVectorize/no_outside_user.ll +++ test/Transforms/LoopVectorize/no_outside_user.ll @@ -1,7 +1,6 @@ ; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 < %s 2>&1 | FileCheck %s ; CHECK: remark: {{.*}}: loop not vectorized: value could not be identified as an induction or reduction variable -; CHECK: remark: {{.*}}: loop not vectorized: use of induction value outside of the loop is not handled by vectorizer target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" @@ -41,34 +40,3 @@ %.lcssa = phi i32 [ %tmp17, %bb16 ] ret i32 %.lcssa } - -; Don't vectorize this loop. Its phi node (induction variable) has an outside -; loop user. We currently don't handle this case. 
-; PR17179 - -; CHECK-LABEL: @test2( -; CHECK-NOT: <2 x - -@x1 = common global i32 0, align 4 -@x2 = common global i32 0, align 4 -@x0 = common global i32 0, align 4 - -define i32 @test2() { -entry: - store i32 0, i32* @x1, align 4 - %0 = load i32, i32* @x0, align 4 - br label %for.cond1.preheader - -for.cond1.preheader: - %inc7 = phi i32 [ 0, %entry ], [ %inc, %for.cond1.preheader ] - %inc = add nsw i32 %inc7, 1 - %cmp = icmp eq i32 %inc, 52 - br i1 %cmp, label %for.end5, label %for.cond1.preheader - -for.end5: - %inc7.lcssa = phi i32 [ %inc7, %for.cond1.preheader ] - %xor = xor i32 %inc7.lcssa, %0 - store i32 52, i32* @x1, align 4 - store i32 1, i32* @x2, align 4 - ret i32 %xor -}