Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4896,37 +4896,78 @@ void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. - std::vector Worklist; - BasicBlock *Latch = TheLoop->getLoopLatch(); - // Start with the conditional branch and walk up the block. - Worklist.push_back(Latch->getTerminator()->getOperand(0)); + // If V is not an instruction inside current loop, it is a Value + // outside of scope where we are interesting in. + auto isOutOfScope = [&](Value *V) -> bool { + Instruction *I = dyn_cast(V); + if (!I || !TheLoop->contains(I)) + return true; + return false; + }; + + SetVector Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); + // Start with the conditional branch. + if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) + Worklist.insert(Latch->getTerminator()->getOperand(0)); // Also add all consecutive pointer values; these values will be uniform - // after vectorization (and subsequent cleanup) and, until revectorization is - // supported, all dependencies must also be uniform. + // after vectorization (and subsequent cleanup). for (Loop::block_iterator B = TheLoop->block_begin(), BE = TheLoop->block_end(); - B != BE; ++B) - for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; ++I) + B != BE; ++B) { + for (BasicBlock::iterator I = (*B)->begin(), IE = (*B)->end(); I != IE; + ++I) { if (I->getType()->isPointerTy() && isConsecutivePtr(&*I)) - Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); + Worklist.insert(&*I); + } + } - while (!Worklist.empty()) { - Instruction *I = dyn_cast(Worklist.back()); - Worklist.pop_back(); - - // Look at instructions inside this loop. - // Stop when reaching PHI nodes. - // TODO: we need to follow values all over the loop, not only in this block. - if (!I || !TheLoop->contains(I) || isa(I)) - continue; + // Expand Worklist in topological order: whenever a new instruction + // is added , its users should be either already inside Worklist, or + // out of scope. It ensures a uniform instruction will only be used + // by uniform instructions or out of scope instructions. + unsigned idx = 0; + do { + Instruction *I = cast(Worklist[idx++]); - // This is a known uniform. - Uniforms.insert(I); + for (auto OV : I->operand_values()) { + if (isOutOfScope(OV)) + continue; + Instruction *OI = cast(OV); + if (std::all_of(OI->user_begin(), OI->user_end(), [&](User *U) -> bool { + return Worklist.count(U) || isOutOfScope(U); + })) { + Worklist.insert(OV); + } + } + } while (idx != Worklist.size()); + + // Dep loops will not be added into Worklist above so they have to be + // processed separately. Bundle the instructions in the dep loop together + // and check their users as a group. + for (auto &Induction : *getInductionVars()) { + auto *PN = Induction.first; + auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + if (std::all_of(PN->user_begin(), PN->user_end(), + [&](User *U) -> bool { + return U == UpdateV || Worklist.count(U) || + isOutOfScope(U); + }) && + std::all_of(UpdateV->user_begin(), UpdateV->user_end(), + [&](User *U) -> bool { + return U == PN || Worklist.count(U) || isOutOfScope(U); + })) { + Worklist.insert(PN); + Worklist.insert(UpdateV); + } + } - // Insert all operands. - Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); + for (auto UV : Worklist) { + Instruction *UI = cast(UV); + Uniforms.insert(UI); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *UI << "\n"); } } Index: test/Transforms/LoopVectorize/X86/uniform-phi.ll =================================================================== --- test/Transforms/LoopVectorize/X86/uniform-phi.ll +++ test/Transforms/LoopVectorize/X86/uniform-phi.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: test +; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599 + +define void @test(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %tmp0 = load float, float* %arrayidx, align 4 + %add = fadd float %tmp0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %add, float* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv, 1599 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; CHECK-LABEL: foo +; CHECK-DAG: LV: Found uniform instruction: %cond = icmp eq i64 %i.next, %n +; CHECK-DAG: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0 +; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + +define void @foo(i32* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = trunc i64 %i to i32 + %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0 + store i32 %tmp0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +}