diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9789,6 +9789,8 @@ DominatorTree *DT, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI, OptimizationRemarkEmitter *ORE) { auto &SE = *PSE.getSE(); + auto PreferPredicate = + TTI->preferPredicateOverEpilogue(L, LI, SE, *AC, TLI, DT, &LVL, IAI); // 1) OptSize takes precedence over all other options, i.e. if this is set, // don't look at hints or options, and don't request a scalar epilogue. @@ -9846,21 +9848,27 @@ << "count.\n"); return None; } - if (*ExpectedTC <= TTI->getMinTripCountTailFoldingThreshold()) { - LLVM_DEBUG(dbgs() << "But the target considers the trip count too " - "small to consider vectorizing.\n"); - reportVectorizationFailure( - "The trip count is below the minimal threshold value.", - "loop trip count is too low, avoiding vectorization", - "LowTripCount", ORE, L); - return None; + // If the trip count is a known compile-time constant, there is no extra + // overhead from vectorizing with a scalar epilogue. Let the cost model + // decide on profitability in that case. + auto TC = SE.getSmallConstantTripCount(L); + if (TC == 0 || PreferPredicate) { + if (*ExpectedTC <= TTI->getMinTripCountTailFoldingThreshold()) { + LLVM_DEBUG(dbgs() << "But the target considers the trip count too " + "small to consider vectorizing.\n"); + reportVectorizationFailure( + "The trip count is below the minimal threshold value.", + "loop trip count is too low, avoiding vectorization", + "LowTripCount", ORE, L); + return None; + } + return CM_SEL_PredicateOrDontVectorize; + } - return CM_SEL_PredicateOrDontVectorize; } } // 5) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, SE, *AC, TLI, DT, &LVL, IAI)) + if (PreferPredicate) return CM_SEL_PredicateElseScalar; return CM_SEL_Allowed; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll @@ -1,22 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -loop-vectorize -S < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" define void @trip7_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip7_i64( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 7, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 7, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 7, [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK: [[ACTIVE_LANE_MASK:%.*]] = phi [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ] -; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK: {{%.*}} = call @llvm.masked.load.nxv2i64.p0nxv2i64(* {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK: call void @llvm.masked.store.nxv2i64.p0nxv2i64( {{%.*}}, * {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 7) -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NOT:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[COND:%.*]] = extractelement [[ACTIVE_LANE_MASK_NOT]], i32 0 -; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to * +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shl nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i32 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to * +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , * [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_LOAD1]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP10]] to * +; CHECK-NEXT: store [[TMP12]], * [[TMP13]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = call 
i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 7, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_06:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_06]] +; CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[TMP17]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[I_06]] +; CHECK-NEXT: [[TMP18:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP18]], [[MUL]] +; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_06]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 7 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void ; entry: br label %for.body @@ -41,19 +80,45 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip5_i8( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP8]], <4 x i8>* [[TMP9]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 5, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; 
CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[I_08]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]] +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_08]] +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP11]], 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP12]] ; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll --- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll @@ -6,53 +6,41 @@ define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 { ; CHECK-LABEL: @trip5_i8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 -6, [[TMP1]] -; CHECK-NEXT: br i1 [[TMP2]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 5, [[TMP7]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP8]], i64 5) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8i8.p0nxv8i8(* [[TMP11]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP12:%.*]] = shl [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i8 1, i32 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP13]], i32 0 -; 
CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to * -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv8i8.p0nxv8i8(* [[TMP15]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP16:%.*]] = add [[TMP12]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to * -; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0nxv8i8( [[TMP16]], * [[TMP17]], i32 1, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP19]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i8> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP8]], <4 x i8>* [[TMP9]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 5, 4 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_08]] -; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP20]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = shl i8 [[TMP11]], 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]] -; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP12]] ; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5 diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll +++ 
b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll @@ -17,58 +17,29 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[PRED_STORE_CONTINUE4]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @A, i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] -; CHECK: pred.load.if1: -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @A, i64 0, i64 [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP10]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] -; CHECK: pred.load.continue2: -; CHECK-NEXT: [[TMP12]] = phi <2 x i32> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @B, i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 -; CHECK-NEXT: store i32 [[TMP16]], i32* [[TMP15]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.if3: -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @B, i64 0, i64 [[TMP1]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 -; CHECK-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6 -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @A, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* +; CHECK-NEXT: [[WIDE_LOAD]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[WIDE_LOAD]], <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* @B, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0 -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 5, 4 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LV:%.*]], [[LOOP]] ] @@ -79,7 +50,7 @@ ; CHECK-NEXT: store i32 [[SCALAR_RECUR]], i32* [[B_PTR]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 5 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -26,7 +26,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>* ; CHECK-NEXT: store <2 x i16*> , <2 x i16*>* [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2 ; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]] @@ -43,7 +43,7 @@ ; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]], align 8 ; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1 ; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2 -; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], 
[[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: bb3: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/pr31671-consecutive-ptr-uniforms.ll rename from llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll rename to llvm/test/Transforms/LoopVectorize/X86/pr31671-consecutive-ptr-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr31671-consecutive-ptr-uniforms.ll @@ -1,6 +1,5 @@ ; REQUIRES: asserts ; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-vectorize,instcombine -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s -; RUN: opt < %s -loop-vectorize -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -18,7 +17,7 @@ ; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i ; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ] ; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5 -; CHECK: define void @PR31671( +; CHECK-LABEL: @PR31671( ; CHECK: vector.ph: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> poison, float %x, i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> poison, <16 x i32> zeroinitializer @@ -68,66 +67,3 @@ attributes #0 = { "target-cpu"="knl" } -; CHECK-LABEL: PR40816 -; -; Check that scalar with predication instructions are not considered uniform -; after vectorization, because that results in replicating a region instead of -; having a single instance (out of VF). The predication stems from a tiny count -; of 3 leading to folding the tail by masking using icmp ule <= <2, 2>. 
-; -; CHECK: LV: Found trip count: 3 -; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0 -; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, i32* {{%.*}}, align 1 -; CHECK: LV: Found not uniform being ScalarWithPredication: {{%.*}} = load i32, i32* {{%.*}}, align 1 -; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}} -; -; FORCE-LABEL: @PR40816( -; FORCE-NEXT: entry: -; FORCE-NEXT: br i1 false, label {{%.*}}, label [[VECTOR_PH:%.*]] -; FORCE: vector.ph: -; FORCE-NEXT: br label [[VECTOR_BODY:%.*]] -; FORCE: vector.body: -; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ] -; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ] -; FORCE-NEXT: [[TMP2:%.*]] = icmp ule <2 x i32> [[VEC_IND]], -; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] -; FORCE: pred.load.if: -; FORCE-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; FORCE-NEXT: store i32 [[TMP0]], i32* @b, align 1 -; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]] -; FORCE: pred.load.continue: -; FORCE-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; FORCE-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]] -; FORCE: pred.load.if1: -; FORCE-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 -; FORCE-NEXT: store i32 [[TMP1]], i32* @b, align 1 -; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; FORCE: pred.load.continue2: -; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], -; FORCE-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 -; FORCE-NEXT: br i1 [[TMP15]], label {{%.*}}, label [[VECTOR_BODY]] -; -@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1 -@b = external global i32, align 1 - -define void @PR40816() #1 { - -entry: - br label %for.body - -for.body: ; preds = %for.body, %entry - %0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - store i32 %0, i32* @b, align 1 - %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0 - %1 = load i32, i32* %arrayidx1, align 1 - %cmp2 = icmp eq i32 %1, 0 - %inc = add nuw nsw i32 %0, 1 - br i1 %cmp2, label %return, label %for.body - -return: ; preds = %for.body - ret void -} - -attributes #1 = { "target-cpu"="core2" } diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -21,16 +21,16 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, 
!llvm.access.group !0 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -40,14 +40,14 @@ ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP9]], [[TMP10]] +; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr40816-consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/X86/pr40816-consecutive-ptr-uniforms.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/pr40816-consecutive-ptr-uniforms.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts +; RUN: opt < %s -aa-pipeline=basic-aa -passes=loop-vectorize,instcombine -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -instcombine -S -debug-only=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -disable-output -print-after=instcombine -enable-new-pm=0 2>&1 | FileCheck %s +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=2 -S | FileCheck %s -check-prefix=FORCE + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: PR40816 +; +; Check that scalar with predication instructions are not 
considered uniform +; after vectorization, because that results in replicating a region instead of +; having a single instance (out of VF). The predication stems from a tiny count +; of 3 leading to folding the tail by masking using icmp ule <= <2, 2>. +; +; CHECK: LV: Found trip count: 3 +; CHECK: LV: Found uniform instruction: {{%.*}} = icmp eq i32 {{%.*}}, 0 +; CHECK-NOT: LV: Found uniform instruction: {{%.*}} = load i32, i32* {{%.*}}, align 1 +; CHECK: LV: Found not uniform being ScalarWithPredication: {{%.*}} = load i32, i32* {{%.*}}, align 1 +; CHECK: LV: Found scalar instruction: {{%.*}} = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 {{%.*}} +; +@a = internal constant [3 x i32] [i32 7, i32 7, i32 0], align 1 +@b = external global i32, align 1 + +define void @PR40816() #1 { +; CHECK-LABEL: @PR40816( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: store i32 [[TMP0]], i32* @b, align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP0]], 2 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[CMP2]], label [[RETURN:%.*]], label [[FOR_BODY]] +; CHECK: return: +; CHECK-NEXT: ret void +; +; FORCE-LABEL: @PR40816( +; FORCE-NEXT: entry: +; FORCE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE: vector.ph: +; FORCE-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE: vector.body: +; FORCE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; FORCE-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; FORCE-NEXT: [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], +; FORCE-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0 +; FORCE-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; FORCE: pred.load.if: +; FORCE-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 +; FORCE-NEXT: store i32 [[TMP2]], i32* @b, align 1 +; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE]] +; FORCE: pred.load.continue: +; FORCE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1 +; FORCE-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] +; FORCE: pred.load.if1: +; FORCE-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 1 +; FORCE-NEXT: store i32 [[TMP4]], i32* @b, align 1 +; FORCE-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; FORCE: pred.load.continue2: +; FORCE-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 +; FORCE-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], +; FORCE-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4 +; FORCE-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCE: middle.block: +; FORCE-NEXT: br i1 true, label [[RETURN:%.*]], label [[SCALAR_PH]] +; FORCE: scalar.ph: +; FORCE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; FORCE-NEXT: br label [[FOR_BODY:%.*]] +; FORCE: for.body: +; FORCE-NEXT: [[TMP6:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; FORCE-NEXT: store i32 [[TMP6]], i32* @b, align 1 +; FORCE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 [[TMP6]] +; FORCE-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX1]], align 1 +; FORCE-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP7]], 0 +; FORCE-NEXT: [[INC]] = add nuw nsw i32 [[TMP6]], 1 +; FORCE-NEXT: br i1 [[CMP2]], label 
[[RETURN]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; FORCE: return: +; FORCE-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + store i32 %0, i32* @b, align 1 + %arrayidx1 = getelementptr inbounds [3 x i32], [3 x i32]* @a, i32 0, i32 %0 + %1 = load i32, i32* %arrayidx1, align 1 + %cmp2 = icmp eq i32 %1, 0 + %inc = add nuw nsw i32 %0, 1 + br i1 %cmp2, label %return, label %for.body + +return: ; preds = %for.body + ret void +} + +attributes #1 = { "target-cpu"="core2" } + diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -38,7 +38,7 @@ ; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -55,7 +55,7 @@ ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -95,40 +95,37 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group !6 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x float> poison), !llvm.access.group !6 -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; CHECK-NEXT: call 
void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>* +; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]] ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -180,7 +177,7 @@ ; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -197,7 +194,7 @@ ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -force-widen-divrem-via-safe-divisor=0 -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -force-widen-divrem-via-safe-divisor=0 -disable-output -debug-only=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -39,40 +39,21 @@ ; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count -; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> -; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<0>, ir<1> -; CHECK-NEXT: EMIT vp<[[COND:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]> -; CHECK-NEXT: WIDEN ir<%cond0> = icmp ult ir<%iv>, ir<13> -; CHECK-NEXT: WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20> -; CHECK-NEXT: Successor(s): pred.store -; CHECK-EMPTY: -; CHECK-NEXT: pred.store: { -; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[COND]]> -; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]> -; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep> -; CHECK-NEXT: Successor(s): pred.store.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.store.continue: -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): loop.0 -; CHECK-EMPTY: -; CHECK-NEXT: loop.0: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + vp<[[CAN_IV]]> -; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]> -; CHECK-NEXT: No successor +; CHECK-NEXT: EMIT 
vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> +; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%2>, ir<0>, ir<1> +; CHECK-NEXT: WIDEN ir<%cond0> = icmp ult ir<%iv>, ir<13> +; CHECK-NEXT: WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20> +; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<%4> +; CHECK-NEXT: WIDEN store ir<%gep>, ir<%s> +; CHECK-NEXT: EMIT vp<%8> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%8> vp<%1> +; CHECK-NEXT: No successors ; CHECK-NEXT: } define void @test(i32* %ptr) { entry: diff --git a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll --- a/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr43166-fold-tail-by-masking.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s ; Test cases below are reduced (and slightly modified) reproducers based on a diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll --- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s -check-prefix=VF2UF2 -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -S | FileCheck %s -check-prefix=VF1UF4 +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s -check-prefix=VF2UF2 +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s -check-prefix=VF1UF4 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopVectorize/reduction-order.ll b/llvm/test/Transforms/LoopVectorize/reduction-order.ll --- a/llvm/test/Transforms/LoopVectorize/reduction-order.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-order.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -force-vector-interleave=1 -S < %s 2>&1 | FileCheck %s +; RUN: opt -passes='loop-vectorize' -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S < %s 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll --- 
a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS -; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS +; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s ; These tests are to check that fold-tail procedure produces correct scalar code when ; loop-vectorization is only unrolling but not vectorizing. diff --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll --- a/llvm/test/Transforms/LoopVectorize/tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll @@ -195,17 +195,39 @@ ; Simple loop with constant, small trip count and no profiling info. ; CHECK-LABEL: @const_low_trip_count( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP4]], <2 x i8> , <2 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>* +; CHECK-NEXT: store <2 x i8> [[TMP5]], <2 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: 
[[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 2 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; @@ -248,7 +270,7 @@ ; CHECK-NEXT: store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1001, 1000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -264,7 +286,7 @@ ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; @@ -291,17 +313,45 @@ ; Simple loop with static, small trip count and no profiling info. ; CHECK-LABEL: @const_small_trip_count_step( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 5 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 5 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> poison, i8 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i8> , <2 x i8> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i8> [[TMP9]], i32 0 +; CHECK-NEXT: store i8 [[TMP10]], i8* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i8> [[TMP9]], i32 1 +; CHECK-NEXT: store i8 [[TMP11]], i8* [[TMP3]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2 +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 3, 2 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 10, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP13]], 0 ; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1 ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 5 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 10 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; @@ -344,7 +394,7 @@ ; CHECK-NEXT: store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1001, 1000 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -360,7 +410,7 @@ ; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !prof [[PROF0]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !prof [[PROF0]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -240,8 +240,6 @@ ; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: -; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count -; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: @@ -250,8 +248,6 @@ ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 21, %iv.next, ir<1> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<21>, ir<1> -; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]> vp<[[BTC]]> ; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr ir<%A>, ir<0> ; CHECK-NEXT: CLONE ir<%lv> = load ir<%gep.A.uniform> ; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%iv>, ir<%k> @@ -259,12 +255,11 @@ ; CHECK-EMPTY: ; CHECK-NEXT: loop.then: ; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp> -; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = select vp<[[MASK]]> vp<[[NOT2]]> ir ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK2]]> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[NOT2]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: @@ -281,7 +276,7 @@ ; CHECK-NEXT: Successor(s): loop.latch ; CHECK-EMPTY: ; 
CHECK-NEXT: loop.latch: -; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF +(nuw) vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: }
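
Below is a minimal standalone sketch of the selection logic introduced by the LoopVectorize.cpp hunk above. The names (EpilogueChoice, TargetInfo, chooseEpilogue) are hypothetical stand-ins, not the actual LLVM types, and the surrounding checks of the real function are omitted; the point is only the new gating: a compile-time-constant trip count no longer triggers the low-trip-count bailout unless the target prefers predication over an epilogue, so the cost model decides profitability instead.

#include <cstdint>
#include <optional>

// Hypothetical stand-ins for the LLVM enums/queries used in the patch.
enum class EpilogueChoice { Allowed, PredicateElseScalar, PredicateOrDontVectorize };

struct TargetInfo {
  bool PreferPredicate;                 // result of preferPredicateOverEpilogue()
  std::uint64_t MinTripCountThreshold;  // getMinTripCountTailFoldingThreshold()
};

// ExpectedTC: estimated small trip count, if known; ConstantTC: exact
// compile-time trip count, or 0 when it is not a constant (mirrors
// SE.getSmallConstantTripCount in the patch).
std::optional<EpilogueChoice>
chooseEpilogue(std::optional<std::uint64_t> ExpectedTC, std::uint64_t ConstantTC,
               const TargetInfo &TTI) {
  if (ExpectedTC) {
    // Only enforce the low-trip-count threshold when the trip count is not a
    // compile-time constant, or when the target prefers predication anyway.
    if (ConstantTC == 0 || TTI.PreferPredicate) {
      if (*ExpectedTC <= TTI.MinTripCountThreshold)
        return std::nullopt;                           // too small: do not vectorize
      return EpilogueChoice::PredicateOrDontVectorize; // fold the tail by masking
    }
    // Constant trip count: a scalar epilogue adds no runtime overhead here,
    // so fall through and let the cost model decide.
  }
  if (TTI.PreferPredicate)
    return EpilogueChoice::PredicateElseScalar;
  return EpilogueChoice::Allowed;
}

This is why, in the tests above, loops such as trip7_i64 and trip5_i8 (constant trip counts of 7 and 5, with no target preference for predication) now get plain vector bodies with a scalar epilogue instead of being tail-folded by masking or left scalar, and the checks were regenerated accordingly.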