Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5007,38 +5007,83 @@ void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. - std::vector Worklist; - BasicBlock *Latch = TheLoop->getLoopLatch(); - // Start with the conditional branch and walk up the block. - Worklist.push_back(Latch->getTerminator()->getOperand(0)); + // If V is not an instruction inside the current loop, it is a Value + // outside of the scope which we are interesting in. + auto isOutOfScope = [&](Value *V) -> bool { + Instruction *I = dyn_cast(V); + return (!I || !TheLoop->contains(I)); + }; + + SetVector Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); + // Start with the conditional branch. + if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) { + Instruction *Cmp = cast(Latch->getTerminator()->getOperand(0)); + Worklist.insert(Cmp); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); + } // Also add all consecutive pointer values; these values will be uniform - // after vectorization (and subsequent cleanup) and, until revectorization is - // supported, all dependencies must also be uniform. - for (Loop::block_iterator B = TheLoop->block_begin(), - BE = TheLoop->block_end(); - B != BE; ++B) - for (Instruction &I : **B) - if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) - Worklist.insert(Worklist.end(), I.op_begin(), I.op_end()); - - while (!Worklist.empty()) { - Instruction *I = dyn_cast(Worklist.back()); - Worklist.pop_back(); - - // Look at instructions inside this loop. - // Stop when reaching PHI nodes. - // TODO: we need to follow values all over the loop, not only in this block. - if (!I || !TheLoop->contains(I) || isa(I)) - continue; + // after vectorization (and subsequent cleanup). + for (auto *BB : TheLoop->getBlocks()) { + for (auto &I : *BB) { + if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) { + Worklist.insert(&I); + DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n"); + } + } + } - // This is a known uniform. - Uniforms.insert(I); + // Expand Worklist in topological order: whenever a new instruction + // is added , its users should be either already inside Worklist, or + // out of scope. It ensures a uniform instruction will only be used + // by uniform instructions or out of scope instructions. + unsigned idx = 0; + do { + Instruction *I = Worklist[idx++]; - // Insert all operands. - Worklist.insert(Worklist.end(), I->op_begin(), I->op_end()); + for (auto OV : I->operand_values()) { + if (isOutOfScope(OV)) + continue; + Instruction *OI = cast(OV); + if (std::all_of(OI->user_begin(), OI->user_end(), [&](User *U) -> bool { + return isOutOfScope(U) || Worklist.count(cast(U)); + })) { + Worklist.insert(OI); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); + } + } + } while (idx != Worklist.size()); + + // For an instruction to be added into Worklist above, all its users inside + // the current loop should be already added into Worklist. This condition + // cannot be true for phi instructions which is always in a dependence loop. + // Because any instruction in the dependence cycle always depends on others + // in the cycle to be added into Worklist first, the result is no ones in + // the cycle will be added into Worklist in the end. + // That is why we process PHI separately. + for (auto &Induction : *getInductionVars()) { + auto *PN = Induction.first; + auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + if (std::all_of(PN->user_begin(), PN->user_end(), + [&](User *U) -> bool { + return U == UpdateV || isOutOfScope(U) || + Worklist.count(cast(U)); + }) && + std::all_of(UpdateV->user_begin(), UpdateV->user_end(), + [&](User *U) -> bool { + return U == PN || isOutOfScope(U) || + Worklist.count(cast(U)); + })) { + Worklist.insert(cast(PN)); + Worklist.insert(cast(UpdateV)); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n"); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n"); + } } + + Uniforms.insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationLegality::canVectorizeMemory() { Index: llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/uniform-phi.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; REQUIRES: asserts +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: test +; CHECK-DAG: LV: Found uniform instruction: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK-DAG: LV: Found uniform instruction: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-DAG: LV: Found uniform instruction: %exitcond = icmp eq i64 %indvars.iv, 1599 + +define void @test(float* noalias nocapture %a, float* noalias nocapture readonly %b) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv + %tmp0 = load float, float* %arrayidx, align 4 + %add = fadd float %tmp0, 1.000000e+00 + %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv + store float %add, float* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv, 1599 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; CHECK-LABEL: foo +; CHECK-DAG: LV: Found uniform instruction: %cond = icmp eq i64 %i.next, %n +; CHECK-DAG: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0 +; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + +define void @foo(i32* %a, i64 %n) { +entry: + br label %for.body + +for.body: + %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] + %tmp0 = trunc i64 %i to i32 + %tmp1 = getelementptr inbounds i32, i32* %a, i32 %tmp0 + store i32 %tmp0, i32* %tmp1, align 4 + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp eq i64 %i.next, %n + br i1 %cond, label %for.end, label %for.body + +for.end: + ret void +}