diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2109,6 +2109,14 @@
   return IntrinsicID_match(IntrID);
 }
 
+/// Matches MaskedLoad Intrinsic.
+template <typename Opnd0, typename Opnd1, typename Opnd2, typename Opnd3>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1, Opnd2, Opnd3>::Ty
+m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2,
+             const Opnd3 &Op3) {
+  return m_Intrinsic<Intrinsic::masked_load>(Op0, Op1, Op2, Op3);
+}
+
 template <Intrinsic::ID IntrID, typename T0>
 inline typename m_Intrinsic_Ty<T0>::Ty m_Intrinsic(const T0 &Op0) {
   return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3218,5 +3218,36 @@
   if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder))
     return replaceInstUsesWith(SI, Fr);
 
+  // select(mask, mload(,,mask,0), 0) -> mload(,,mask,0)
+  // Load inst is intentionally not checked for hasOneUse()
+  if (match(FalseVal, m_Zero()) &&
+      match(TrueVal, m_MaskedLoad(m_Value(), m_Value(), m_Specific(CondVal),
+                                  m_CombineOr(m_Undef(), m_Zero())))) {
+    auto *MaskedLoad = cast<IntrinsicInst>(TrueVal);
+    if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
+      MaskedLoad->setArgOperand(3, FalseVal /* Zero */);
+    return replaceInstUsesWith(SI, MaskedLoad);
+  }
+
+  Value *Mask;
+  if (match(TrueVal, m_Zero()) &&
+      match(FalseVal, m_MaskedLoad(m_Value(), m_Value(), m_Value(Mask),
+                                   m_CombineOr(m_Undef(), m_Zero())))) {
+    // We can remove the select by ensuring the load zeros all lanes the
+    // select would have. We determine this by proving there is no overlap
+    // between the load and select masks.
+    // (i.e. (load_mask & select_mask) == 0 == no overlap)
+    bool CanMergeSelectIntoLoad = false;
+    if (Value *V = SimplifyAndInst(CondVal, Mask, SQ.getWithInstruction(&SI)))
+      CanMergeSelectIntoLoad = match(V, m_Zero());
+
+    if (CanMergeSelectIntoLoad) {
+      auto *MaskedLoad = cast<IntrinsicInst>(FalseVal);
+      if (isa<UndefValue>(MaskedLoad->getArgOperand(3)))
+        MaskedLoad->setArgOperand(3, TrueVal /* Zero */);
+      return replaceInstUsesWith(SI, MaskedLoad);
+    }
+  }
+
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/select-masked_load.ll b/llvm/test/Transforms/InstCombine/select-masked_load.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/select-masked_load.ll
@@ -0,0 +1,98 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Fold zeroing of inactive lanes into the load's passthrough parameter.
+define <4 x float> @masked_load_and_zero_inactive_1(<4 x float>* %ptr, <4 x i1> %mask) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_1(
+; CHECK: %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)
+; CHECK-NEXT: ret <4 x float> %load
+  %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> undef)
+  %masked = select <4 x i1> %mask, <4 x float> %load, <4 x float> zeroinitializer
+  ret <4 x float> %masked
+}
+
+; As above but reuse the load's existing passthrough.
+define <4 x i32> @masked_load_and_zero_inactive_2(<4 x i32>* %ptr, <4 x i1> %mask) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_2(
+; CHECK: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
+; CHECK-NEXT: ret <4 x i32> %load
+  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
+  %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
+  ret <4 x i32> %masked
+}
+
+; No transform when the load's passthrough cannot be reused or altered.
+define <4 x i32> @masked_load_and_zero_inactive_3(<4 x i32>* %ptr, <4 x i1> %mask, <4 x i32> %passthrough) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_3(
+; CHECK: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
+; CHECK-NEXT: %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
+; CHECK-NEXT: ret <4 x i32> %masked
+  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
+  %masked = select <4 x i1> %mask, <4 x i32> %load, <4 x i32> zeroinitializer
+  ret <4 x i32> %masked
+}
+
+; Remove redundant select when its mask doesn't overlap with the load mask.
+define <4 x i32> @masked_load_and_zero_inactive_4(<4 x i32>* %ptr, <4 x i1> %inv_mask) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_4(
+; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
+; CHECK-NEXT: ret <4 x i32> %load
+  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> undef)
+  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
+  ret <4 x i32> %masked
+}
+
+; As above but reuse the load's existing passthrough.
+define <4 x i32> @masked_load_and_zero_inactive_5(<4 x i32>* %ptr, <4 x i1> %inv_mask) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_5(
+; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
+; CHECK-NEXT: ret <4 x i32> %load
+  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
+  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
+  ret <4 x i32> %masked
+}
+
+; No transform when the load's passthrough cannot be reused or altered.
+define <4 x i32> @masked_load_and_zero_inactive_6(<4 x i32>* %ptr, <4 x i1> %inv_mask, <4 x i32> %passthrough) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_6(
+; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
+; CHECK-NEXT: %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
+; CHECK-NEXT: ret <4 x i32> %masked
+  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough)
+  %masked = select <4 x i1> %inv_mask, <4 x i32> zeroinitializer, <4 x i32> %load
+  ret <4 x i32> %masked
+}
+
+; No transform when select and load masks have no relation.
+define <4 x i32> @masked_load_and_zero_inactive_7(<4 x i32>* %ptr, <4 x i1> %mask1, <4 x i1> %mask2) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_7(
+; CHECK: %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask1, <4 x i32> zeroinitializer)
+; CHECK-NEXT: %masked = select <4 x i1> %mask2, <4 x i32> zeroinitializer, <4 x i32> %load
+; CHECK-NEXT: ret <4 x i32> %masked
+  %load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %ptr, i32 4, <4 x i1> %mask1, <4 x i32> zeroinitializer)
+  %masked = select <4 x i1> %mask2, <4 x i32> zeroinitializer, <4 x i32> %load
+  ret <4 x i32> %masked
+}
+
+; A more complex case where we can prove the select mask is a subset of the
+; load's inactive lanes and thus the load's passthrough takes effect.
+define <4 x float> @masked_load_and_zero_inactive_8(<4 x float>* %ptr, <4 x i1> %inv_mask, <4 x i1> %cond) {
+; CHECK-LABEL: @masked_load_and_zero_inactive_8(
+; CHECK: %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: %pg = and <4 x i1> %mask, %cond
+; CHECK-NEXT: %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %pg, <4 x float> zeroinitializer)
+; CHECK-NEXT: ret <4 x float> %load
+  %mask = xor <4 x i1> %inv_mask, <i1 true, i1 true, i1 true, i1 true>
+  %pg = and <4 x i1> %mask, %cond
+  %load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %pg, <4 x float> undef)
+  %masked = select <4 x i1> %inv_mask, <4 x float> zeroinitializer, <4 x float> %load
+  ret <4 x float> %masked
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -12,17 +12,16 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -30,7 +29,7 @@
 ; CHECK: .lr.ph:
 ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
@@ -60,27 +59,25 @@
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 257)
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
 ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[VEC_IND]], <4 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD1]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
-; CHECK-NEXT: [[TMP12]] = add i32 [[TMP11]], [[TMP9]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD1]])
+; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -88,7 +85,7 @@
 ; CHECK: .lr.ph:
 ; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: ._crit_edge:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -252,19 +252,18 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
 ;
 entry:
@@ -394,19 +393,18 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i16 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i16 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
 ;
 entry:
@@ -488,19 +486,18 @@
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i8 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP2]])
-; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT: [[TMP3]] = add i8 [[TMP2]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
 ;
 entry:
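
For reference, the new InstCombine fold moves the zeroing of inactive lanes into the masked load's passthrough operand. A minimal IR sketch, mirroring the first test in select-masked_load.ll (value names are illustrative only):

  %load   = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> undef)
  %masked = select <4 x i1> %mask, <4 x float> %load, <4 x float> zeroinitializer
  ; ... becomes ...
  %masked = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %ptr, i32 4, <4 x i1> %mask, <4 x float> zeroinitializer)

When the select instead zeroes its true arm, the fold applies only if the select and load masks can be proven disjoint (their conjunction simplifies to zero), as exercised by the inv_mask tests above.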