Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5040,41 +5040,23 @@ return true; } -void LoopVectorizationLegality::collectLoopUniforms() { - // We now know that the loop is vectorizable! - // Collect variables that will remain uniform after vectorization. - +/// Given some seed instructions in \p Worklist, find out the dependent +/// closure set and return it in \p Worklist. The dependent closure set +/// contains the seed instructions, and all the instructions in the +/// \p Loop which are either used by other instructions in the set, or +/// by instructions outside of the \p loop. +static void getDependentClosure(SetVector &Worklist, Loop *Loop, + LoopVectorizationLegality *Legal) { // If V is not an instruction inside the current loop, it is a Value // outside of the scope which we are interesting in. auto isOutOfScope = [&](Value *V) -> bool { Instruction *I = dyn_cast(V); - return (!I || !TheLoop->contains(I)); + return (!I || !Loop->contains(I)); }; - SetVector Worklist; - BasicBlock *Latch = TheLoop->getLoopLatch(); - // Start with the conditional branch. - if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) { - Instruction *Cmp = cast(Latch->getTerminator()->getOperand(0)); - Worklist.insert(Cmp); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } - - // Also add all consecutive pointer values; these values will be uniform - // after vectorization (and subsequent cleanup). - for (auto *BB : TheLoop->getBlocks()) { - for (auto &I : *BB) { - if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) { - Worklist.insert(&I); - DEBUG(dbgs() << "LV: Found uniform instruction: " << I << "\n"); - } - } - } - // Expand Worklist in topological order: whenever a new instruction // is added , its users should be either already inside Worklist, or - // out of scope. 
It ensures a uniform instruction will only be used - // by uniform instructions or out of scope instructions. + // out of scope. unsigned idx = 0; do { Instruction *I = Worklist[idx++]; @@ -5085,10 +5067,8 @@ Instruction *OI = cast(OV); if (std::all_of(OI->user_begin(), OI->user_end(), [&](User *U) -> bool { return isOutOfScope(U) || Worklist.count(cast(U)); - })) { + })) Worklist.insert(OI); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } } } while (idx != Worklist.size()); @@ -5099,9 +5079,9 @@ // in the cycle to be added into Worklist first, the result is no ones in // the cycle will be added into Worklist in the end. // That is why we process PHI separately. - for (auto &Induction : *getInductionVars()) { + for (auto &Induction : *Legal->getInductionVars()) { auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + auto *UpdateV = PN->getIncomingValueForBlock(Loop->getLoopLatch()); if (std::all_of(PN->user_begin(), PN->user_end(), [&](User *U) -> bool { return U == UpdateV || isOutOfScope(U) || @@ -5114,12 +5094,38 @@ })) { Worklist.insert(cast(PN)); Worklist.insert(cast(UpdateV)); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *PN << "\n"); - DEBUG(dbgs() << "LV: Found uniform instruction: " << *UpdateV << "\n"); } } +} - Uniforms.insert(Worklist.begin(), Worklist.end()); +void LoopVectorizationLegality::collectLoopUniforms() { + // We now know that the loop is vectorizable! + // Collect variables that will remain uniform after vectorization. + + SetVector Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); + // Start with the conditional branch. + Instruction *Cmp = + dyn_cast(Latch->getTerminator()->getOperand(0)); + if (Cmp && TheLoop->contains(Cmp)) + Worklist.insert(Cmp); + + // Also add all consecutive pointer values; these values will be uniform + // after vectorization (and subsequent cleanup). 
+ for (auto *BB : TheLoop->getBlocks()) { + for (auto &I : *BB) { + if (I.getType()->isPointerTy() && isConsecutivePtr(&I)) + Worklist.insert(&I); + } + } + + // Find dependent closure for the seed uniform instructions in Worklist. + getDependentClosure(Worklist, TheLoop, this); + + for (auto &I : Worklist) { + Uniforms.insert(I); + DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + } } bool LoopVectorizationLegality::canVectorizeMemory() { @@ -6485,66 +6491,36 @@ VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } - // Ignore induction phis that are only used in either GetElementPtr or ICmp - // instruction to exit loop. Induction variables usually have large types and - // can have big impact when estimating register usage. - // This is for when VF > 1. - for (auto &Induction : *Legal->getInductionVars()) { - auto *PN = Induction.first; - auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch()); + SetVector Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); + // Loop compare instruction will not have vector version. Add it into + // Worklist as a seed instruction. + Instruction *Cmp = + dyn_cast(Latch->getTerminator()->getOperand(0)); + if (Cmp && TheLoop->contains(Cmp)) + Worklist.insert(Cmp); - // Check that the PHI is only used by the induction increment (UpdateV) or - // by GEPs. Then check that UpdateV is only used by a compare instruction, - // the loop header PHI, or by GEPs. - // FIXME: Need precise def-use analysis to determine if this instruction - // variable will be vectorized. 
- if (std::all_of(PN->user_begin(), PN->user_end(), - [&](const User *U) -> bool { - return U == UpdateV || isa(U); - }) && - std::all_of(UpdateV->user_begin(), UpdateV->user_end(), - [&](const User *U) -> bool { - return U == PN || isa(U) || - isa(U); - })) { - VecValuesToIgnore.insert(PN); - VecValuesToIgnore.insert(UpdateV); + // Ptr Instructions used by consecutive load/stores or used by + // non-consecutive && non-gather/scatter load/stores will not have + // vector versions. Add them into Worklist as seed instructions. + for (auto *BB : TheLoop->getBlocks()) { + for (auto &I : *BB) { + LoadInst *LI = dyn_cast(&I); + StoreInst *SI = dyn_cast(&I); + if (!LI && !SI) + continue; + Value *Ptr = SI ? SI->getPointerOperand() : LI->getPointerOperand(); + Instruction *PI = dyn_cast(Ptr); + if (PI && (Legal->isConsecutivePtr(PI) || + !isGatherOrScatterLegal(&I, PI, Legal))) + Worklist.insert(PI); } } - // Ignore instructions that will not be vectorized. - // This is for when VF > 1. - for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; - ++bb) { - for (auto &Inst : **bb) { - switch (Inst.getOpcode()) - case Instruction::GetElementPtr: { - // Ignore GEP if its last operand is an induction variable so that it is - // a consecutive load/store and won't be vectorized as scatter/gather - // pattern. - - GetElementPtrInst *Gep = cast(&Inst); - unsigned NumOperands = Gep->getNumOperands(); - unsigned InductionOperand = getGEPInductionOperand(Gep); - bool GepToIgnore = true; - - // Check that all of the gep indices are uniform except for the - // induction operand. - for (unsigned i = 0; i != NumOperands; ++i) { - if (i != InductionOperand && - !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), - TheLoop)) { - GepToIgnore = false; - break; - } - } + // Find the dependent closure which will not have vector versions. 
+ getDependentClosure(Worklist, TheLoop, Legal); - if (GepToIgnore) - VecValuesToIgnore.insert(&Inst); - break; - } - } - } + VecValuesToIgnore.insert(Worklist.begin(), Worklist.end()); } void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, Index: test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll =================================================================== --- test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll +++ test/Transforms/LoopVectorize/PowerPC/vsx-tsvc-s173.ll @@ -43,7 +43,7 @@ ; CHECK-LABEL: @s173 ; CHECK: load <4 x float>, <4 x float>* -; CHECK: add nsw i64 %1, 16000 +; CHECK: add nsw i64 %index, 16000 ; CHECK: ret i32 0 } Index: test/Transforms/LoopVectorize/X86/reg-usage.ll =================================================================== --- test/Transforms/LoopVectorize/X86/reg-usage.ll +++ test/Transforms/LoopVectorize/X86/reg-usage.ll @@ -1,9 +1,7 @@ -; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -S 2>&1 | FileCheck %s +; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s +; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F ; REQUIRES: asserts -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - @a = global [1024 x i8] zeroinitializer, align 16 @b = global [1024 x i8] zeroinitializer, align 16 @@ -45,6 +43,42 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } +define i32 @goo() { +; CHECK-LABEL: goo +; CHECK: LV(REG): VF = 4 +; CHECK-NEXT: LV(REG): Found max usage: 4 +; CHECK: LV(REG): VF = 8 +; CHECK-NEXT: LV(REG): Found max usage: 7 +; CHECK: LV(REG): VF = 16 +; CHECK-NEXT: 
LV(REG): Found max usage: 13 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %add.lcssa = phi i32 [ %add, %for.body ] + ret i32 %add.lcssa + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %tmp1 = add nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %tmp1 + %tmp = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %tmp to i32 + %tmp2 = add nsw i64 %indvars.iv, 2 + %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %tmp2 + %tmp3 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %tmp3 to i32 + %sub = sub nsw i32 %conv, %conv3 + %ispos = icmp sgt i32 %sub, -1 + %neg = sub nsw i32 0, %sub + %tmp4 = select i1 %ispos, i32 %sub, i32 %neg + %add = add nsw i32 %tmp4, %s.015 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + define i64 @bar(i64* nocapture %a) { ; CHECK-LABEL: bar ; CHECK: LV(REG): VF = 2 @@ -69,3 +103,31 @@ %exitcond = icmp eq i64 %inc, 1024 br i1 %exitcond, label %for.cond.cleanup, label %for.body } + +@d = external global [0 x i64], align 8 +@e = external global [0 x i32], align 4 +@c = external global [0 x i32], align 4 + +define void @hoo(i32 %n) { +; AVX512F-LABEL: hoo +; AVX512F: LV(REG): VF = 16 +; AVX512F: LV(REG): Found max usage: 2 +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 %indvars.iv + %tmp = load i64, i64* %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 %tmp + %tmp1 = load i32, i32* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 %indvars.iv + store 
i32 %tmp1, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 10000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: test/Transforms/LoopVectorize/reverse_induction.ll =================================================================== --- test/Transforms/LoopVectorize/reverse_induction.ll +++ test/Transforms/LoopVectorize/reverse_induction.ll @@ -142,11 +142,24 @@ ; } ; CHECK-LABEL: @reverse_forward_induction_i64_i8( -; CHECK: vector.body ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] -; CHECK: %vec.ind = phi <4 x i64> [ , %vector.ph ] -; CHECK: %step.add = add <4 x i64> %vec.ind, -; CHECK: trunc i64 %index to i8 +; CHECK: %offset.idx = sub i64 1023, %index +; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3 define void @reverse_forward_induction_i64_i8() { entry: @@ -169,10 +182,24 @@ } ; CHECK-LABEL: @reverse_forward_induction_i64_i8_signed( -; CHECK: vector.body: -; CHECK: %index = phi i64 [ 0, %vector.ph ], [ 
%index.next, %vector.body ] -; CHECK: %vec.ind = phi <4 x i64> [ , %vector.ph ] -; CHECK: %step.add = add <4 x i64> %vec.ind, +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i64 1023, %index +; CHECK: %[[a0:.+]] = add i64 %offset.idx, 0 +; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %[[a0]], i64 0 +; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement <4 x i64> %[[v1]], i64 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3 +; CHECK: %[[a4:.+]] = add i64 %offset.idx, -4 +; CHECK: %[[v4:.+]] = insertelement <4 x i64> undef, i64 %[[a4]], i64 0 +; CHECK: %[[a5:.+]] = add i64 %offset.idx, -5 +; CHECK: %[[v5:.+]] = insertelement <4 x i64> %[[v4]], i64 %[[a5]], i64 1 +; CHECK: %[[a6:.+]] = add i64 %offset.idx, -6 +; CHECK: %[[v6:.+]] = insertelement <4 x i64> %[[v5]], i64 %[[a6]], i64 2 +; CHECK: %[[a7:.+]] = add i64 %offset.idx, -7 +; CHECK: %[[v7:.+]] = insertelement <4 x i64> %[[v6]], i64 %[[a7]], i64 3 define void @reverse_forward_induction_i64_i8_signed() { entry: Index: test/Transforms/LoopVectorize/reverse_iter.ll =================================================================== --- test/Transforms/LoopVectorize/reverse_iter.ll +++ test/Transforms/LoopVectorize/reverse_iter.ll @@ -13,9 +13,19 @@ ; } ; -;CHECK-LABEL: @foo( -;CHECK: -;CHECK: ret +; CHECK-LABEL: @foo( +; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] +; CHECK: %offset.idx = sub i64 {{.*}}, %index +; CHECK: %[[v0:.+]] = insertelement <4 x i64> undef, i64 %offset.idx, i64 0 +; CHECK: %[[a1:.+]] = add i64 %offset.idx, -1 +; CHECK: %[[v1:.+]] = insertelement <4 x i64> %[[v0]], i64 %[[a1]], i64 1 +; CHECK: %[[a2:.+]] = add i64 %offset.idx, -2 +; CHECK: %[[v2:.+]] = insertelement 
<4 x i64> %[[v1]], i64 %[[a2]], i64 2 +; CHECK: %[[a3:.+]] = add i64 %offset.idx, -3 +; CHECK: %[[v3:.+]] = insertelement <4 x i64> %[[v2]], i64 %[[a3]], i64 3 +; CHECK: %[[tv:.+]] = trunc <4 x i64> %[[v3]] to <4 x i32> +; CHECK: %[[sv:.+]] = shl nsw <4 x i32> %[[tv]], +; CHECK: ret define i32 @foo(i32 %n, i32* nocapture %A) { %1 = icmp sgt i32 %n, 0 br i1 %1, label %.lr.ph, label %._crit_edge