Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4668,14 +4668,26 @@
   SetVector<Instruction *> Worklist;
   BasicBlock *Latch = TheLoop->getLoopLatch();
 
+  // Instructions that are scalar with predication must not be considered
+  // uniform after vectorization, because that would create an erroneous
+  // replicating region where only a single instance out of VF should be formed.
+  // TODO: optimize such rare cases if found important, see PR40816.
+  auto WorklistInsert = [&](Instruction *I) -> void {
+    if (isScalarWithPredication(I, VF)) {
+      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
+                        << *I << "\n");
+      return;
+    }
+    Worklist.insert(I);
+    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+  };
+
   // Start with the conditional branch. If the branch condition is an
   // instruction contained in the loop that is only used by the branch, it is
   // uniform.
   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
-  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) {
-    Worklist.insert(Cmp);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
-  }
+  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+    WorklistInsert(Cmp);
 
   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
   // are pointers that are treated like consecutive pointers during
@@ -4734,10 +4746,8 @@
   // Add to the Worklist all consecutive and consecutive-like pointers that
   // aren't also identified as possibly non-uniform.
   for (auto *V : ConsecutiveLikePtrs)
-    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) {
-      LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
-      Worklist.insert(V);
-    }
+    if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+      WorklistInsert(V);
 
   // Expand Worklist in topological order: whenever a new instruction
   // is added , its users should be already inside Worklist. It ensures
@@ -4763,10 +4773,8 @@
             return Worklist.count(J) ||
                    (OI == getLoadStorePointerOperand(J) &&
                     isUniformDecision(J, VF));
-          })) {
-        Worklist.insert(OI);
-        LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
-      }
+          }))
+        WorklistInsert(OI);
     }
   }
 
@@ -4808,11 +4816,8 @@
       continue;
 
     // The induction variable and its update instruction will remain uniform.
-    Worklist.insert(Ind);
-    Worklist.insert(IndUpdate);
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n");
-    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate
-                      << "\n");
+    WorklistInsert(Ind);
+    WorklistInsert(IndUpdate);
   }
 
   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
Index: llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -1,5 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -64,4 +65,92 @@
   ret void
 }
 
+; CHECK-LABEL: PR40816
+;
+; Check that scalar with predication instructions are not considered uniform
+; after vectorization, because that results in replicating a region instead of
+; having a single instance (out of VF).
+;
+; CHECK: LV: Found trip count: 3
+; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i16 {{%.*}}, 0
+; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i16, i16* {{%.*}}, align 1
+; CHECK: LV: Found not uniform being ScalarWithPredication: {{%.*}} = load i16, i16* {{%.*}}, align 1
+; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 {{%.*}}, i16 {{%.*}}
+; FORCE-LABEL: @PR40816(
+; FORCE-NEXT:  entry:
+; FORCE-NEXT:    br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]]
+; FORCE:       vector.ph:
+; FORCE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FORCE:       vector.body:
+; FORCE-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
+; FORCE-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; FORCE-NEXT:    [[VEC_IND3:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[PRED_LOAD_CONTINUE6]] ]
+; FORCE-NEXT:    [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; FORCE-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; FORCE-NEXT:    br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; FORCE:       pred.store.if:
+; FORCE-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
+; FORCE-NEXT:    store i32 [[TMP2]], i32* @b, align 1
+; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; FORCE:       pred.store.continue:
+; FORCE-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; FORCE-NEXT:    br i1 [[TMP3]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; FORCE:       pred.store.if1:
+; FORCE-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
+; FORCE-NEXT:    store i32 [[TMP4]], i32* @b, align 1
+; FORCE-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; FORCE:       pred.store.continue2:
+; FORCE-NEXT:    [[TMP5:%.*]] = trunc i32 [[INDEX]] to i16
+; FORCE-NEXT:    [[TMP6:%.*]] = mul <2 x i16> [[VEC_IND3]], [[VEC_IND3]]
+; FORCE-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
+; FORCE-NEXT:    br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; FORCE:       pred.load.if:
+; FORCE-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0
+; FORCE-NEXT:    [[TMP9:%.*]] = add i16 [[TMP5]], 0
+; FORCE-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 [[TMP8]], i16 [[TMP9]]
+; FORCE-NEXT:    [[TMP11:%.*]] = load i16, i16* [[TMP10]], align 1
+; FORCE-NEXT:    [[TMP12:%.*]] = insertelement <2 x i16> undef, i16 [[TMP11]], i32 0
+; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE]]
+; FORCE:       pred.load.continue:
+; FORCE-NEXT:    [[TMP13:%.*]] = phi <2 x i16> [ undef, [[PRED_STORE_CONTINUE2]] ], [ [[TMP12]], [[PRED_LOAD_IF]] ]
+; FORCE-NEXT:    [[TMP14:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; FORCE-NEXT:    br i1 [[TMP14]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
+; FORCE:       pred.load.if5:
+; FORCE-NEXT:    [[TMP15:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
+; FORCE-NEXT:    [[TMP16:%.*]] = add i16 [[TMP5]], 1
+; FORCE-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 [[TMP15]], i16 [[TMP16]]
+; FORCE-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP17]], align 1
+; FORCE-NEXT:    [[TMP19:%.*]] = insertelement <2 x i16> [[TMP13]], i16 [[TMP18]], i32 1
+; FORCE-NEXT:    br label [[PRED_LOAD_CONTINUE6]]
+; FORCE:       pred.load.continue6:
+; FORCE-NEXT:    [[TMP20:%.*]] = phi <2 x i16> [ [[TMP13]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ]
+; FORCE-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; FORCE-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; FORCE-NEXT:    [[VEC_IND_NEXT4]] = add <2 x i16> [[VEC_IND3]], <i16 2, i16 2>
+; FORCE-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; FORCE-NEXT:    br i1 [[TMP21]], label {{%.*}}, label [[VECTOR_BODY]]
+;
+
+@a = internal constant [5 x [3 x i16]] [[3 x i16] [i16 7, i16 7, i16 7], [3 x i16] [i16 8, i16 8, i16 8], [3 x i16] [i16 4, i16 0, i16 5], [3 x i16] [i16 0, i16 4, i16 0], [3 x i16] zeroinitializer], align 1
+@b = external global i32, align 1
+
+define void @PR40816() #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  store i32 %storemerge, i32* @b, align 1
+  %0 = trunc i32 %storemerge to i16
+  %1 = mul i16 %0, %0
+  %arrayidx1 = getelementptr inbounds [5 x [3 x i16]], [5 x [3 x i16]]* @a, i16 0, i16 %1, i16 %0
+  %2 = load i16, i16* %arrayidx1, align 1
+  %cmp2 = icmp eq i16 %2, 0
+  %inc = add nuw nsw i32 %storemerge, 1
+  br i1 %cmp2, label %return, label %for.body
+
+return:                                           ; preds = %for.body
+  ret void
+}
+
 attributes #0 = { "target-cpu"="knl" }