diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1283,6 +1283,20 @@ bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; + /// \returns True if the target prefers reductions select kept in the loop + /// when tail folding. i.e. + /// loop: + /// p = phi (0, s) + /// a = add (p, x) + /// s = select (mask, a, p) + /// vecreduce.add(s) + /// + /// As opposed to the normal scheme of p = phi (0, a) which allows the select + /// to be pulled out of the loop. If the select(.., add, ..) can be predicated + /// by the target, this can lead to cleaner code generation. + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; @@ -1573,6 +1587,8 @@ VectorType *VecTy) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; + virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; virtual bool hasActiveVectorLength() const = 0; @@ -2073,6 +2089,10 @@ ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); } + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags); + } bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -658,6 +658,11 @@ return false; } + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } unsigned getGISelRematGlobalCost() const { return 1; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1013,6 +1013,11 @@ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); } +bool TargetTransformInfo::preferPredicatedReductionSelect( + unsigned Opcode, Type *Ty, ReductionFlags Flags) const { + return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags); +} + bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const { return TTIImpl->shouldExpandReduction(II); } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -186,6 +186,9 @@ bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; + bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const; + bool shouldExpandReduction(const IntrinsicInst *II) const { switch (II->getIntrinsicID()) { case Intrinsic::experimental_vector_reduce_v2_fadd: diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1589,35 +1589,28 @@ const LoopAccessInfo *LAI) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); - // If there are live-out values, it is probably a reduction, which needs a - // final reduction step after the loop. MVE has a VADDV instruction to reduce - // integer vectors, but doesn't have an equivalent one for float vectors. A - // live-out value that is not recognised as a reduction will result in the - // tail-predicated loop to be reverted to a non-predicated loop and this is - // very expensive, i.e. it has a significant performance impact. So, in this - // case it's better not to tail-predicate the loop, which is what we check - // here. Thus, we allow only 1 live-out value, which has to be an integer - // reduction, which matches the loops supported by ARMLowOverheadLoops. - // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in - // sync with each other. + // If there are live-out values, it is probably a reduction. We can predicate + // most reduction operations freely under MVE using a combination of + // prefer-predicated-reduction-select and inloop reductions. We limit this to + // floating point and integer reductions, but don't check for operators + // specifically here. If the value ends up not being a reduction (and so the + // vectorizer cannot tailfold the loop), we should fall back to standard + // vectorization automatically. SmallVector< Instruction *, 8 > LiveOuts; LiveOuts = llvm::findDefsUsedOutsideOfLoop(L); - bool IntReductionsDisabled = + bool ReductionsDisabled = EnableTailPredication == TailPredication::EnabledNoReductions || EnableTailPredication == TailPredication::ForceEnabledNoReductions; for (auto *I : LiveOuts) { - if (!I->getType()->isIntegerTy()) { - LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer " + if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() && + !I->getType()->isHalfTy()) { + LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float " "live-out value\n"); return false; } - if (I->getOpcode() != Instruction::Add) { - LLVM_DEBUG(dbgs() << "Only add reductions supported\n"); - return false; - } - if (IntReductionsDisabled) { - LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n"); + if (ReductionsDisabled) { + LLVM_DEBUG(dbgs() << "Reductions not enabled\n"); return false; } } @@ -1811,3 +1804,10 @@ TTI::ReductionFlags Flags) const { return ST->hasMVEIntegerOps(); } + +bool ARMTTIImpl::preferPredicatedReductionSelect( + unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { + if (!ST->hasMVEIntegerOps()) + return false; + return true; +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3929,7 +3929,10 @@ // and so use the select value for the phi instead of the old // LoopExitValue. RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi]; - if (PreferPredicatedReductionSelect) { + if (PreferPredicatedReductionSelect || + TTI->preferPredicatedReductionSelect( + RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()), + Phi->getType(), TargetTransformInfo::ReductionFlags())) { auto *VecRdxPhi = cast(getOrCreateVectorValue(Phi, Part)); VecRdxPhi->setIncomingValueForBlock( LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel); diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll @@ -12,18 +12,18 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) -; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -60,7 +60,7 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* @@ -70,14 +70,14 @@ ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]] ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]] -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) ; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] @@ -117,39 +117,29 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ] ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[L9:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L3:%.*]] = load i32, i32* [[L2]], align 4 -; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L5:%.*]] = load i32, i32* [[L4]], align 4 -; CHECK-NEXT: [[L8:%.*]] = mul i32 [[PROD_02]], [[L3]] -; CHECK-NEXT: [[L9]] = mul i32 [[L8]], [[L5]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7 +; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7 ; CHECK: ._crit_edge: -; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]] ; entry: @@ -181,39 +171,29 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ -1, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[L0]] -; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[L1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9 +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9 ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; entry: @@ -245,39 +225,29 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[VEC_PHI]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]] -; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11 +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11 ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; entry: @@ -309,39 +279,29 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6]] = xor <4 x i32> [[VEC_PHI]], [[TMP5]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]] -; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13 +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13 ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] ; entry: @@ -373,39 +333,29 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef) +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[L0]] -; CHECK-NEXT: [[FADD]] = fadd fast float [[ADD]], [[L1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15 +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15 ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: @@ -437,39 +387,29 @@ ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef) +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]] +; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16 ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]]) -; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]]) +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]] -; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[L0]] -; CHECK-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[L1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17 +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17 ; CHECK: for.end: -; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FMUL]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll @@ -34,3 +34,212 @@ %exitcond.not = icmp eq i32 %inc, %N br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body } + +define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) { +; CHECK-LABEL: f32_reduction( +; CHECK: vector.body: +; CHECK: @llvm.masked.load +; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ] + %Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1 + %0 = load float, float* %Input.addr.07, align 4 + %add = fadd fast float %0, %sum.08 + %dec = add i32 %blkCnt.09, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %add.lcssa = phi float [ %add, %while.body ] + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ] + %conv = uitofp i32 %N to float + %div = fdiv fast float %sum.0.lcssa, %conv + store float %div, float* %Output, align 4 + ret void +} + +define dso_local void @f16_reduction(half* nocapture readonly %Input, i32 %N, half* nocapture %Output) { +; CHECK-LABEL: f16_reduction( +; CHECK: vector.body: +; CHECK: @llvm.masked.load +; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + +while.body.preheader: ; preds = %entry + br label %while.body + +while.body: ; preds = %while.body.preheader, %while.body + %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %sum.08 = phi half [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ] + %Input.addr.07 = phi half* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds half, half* %Input.addr.07, i32 1 + %0 = load half, half* %Input.addr.07, align 2 + %add = fadd fast half %0, %sum.08 + %dec = add i32 %blkCnt.09, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %add.lcssa = phi half [ %add, %while.body ] + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %sum.0.lcssa = phi half [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ] + %conv = uitofp i32 %N to half + %div = fdiv fast half %sum.0.lcssa, %conv + store half %div, half* %Output, align 2 + ret void +} + +define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) { +; CHECK-LABEL: mixed_f32_i32_reduction( +; CHECK: vector.body: +; CHECK: @llvm.masked.load +; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +entry: + %cmp15 = icmp eq i32 %N, 0 + br i1 %cmp15, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ] + %fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ] + %fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ] + %iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1 + %incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1 + %0 = load i32, i32* %iInput.addr.016, align 4 + %add2 = add nsw i32 %0, %isum.019 + %1 = load float, float* %fInput.addr.017, align 4 + %add = fadd fast float %1, %fsum.018 + %dec = add i32 %blkCnt.020, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: + %add.lcssa = phi float [ %add, %while.body ] + %add2.lcssa = phi i32 [ %add2, %while.body ] + %phitmp = sitofp i32 %add2.lcssa to float + br label %while.end + +while.end: + %fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ] + %isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ] + %conv = uitofp i32 %N to float + %div = fdiv fast float %fsum.0.lcssa, %conv + store float %div, float* %fOutput, align 4 + %div5 = fdiv fast float %isum.0.lcssa, %conv + %conv6 = fptosi float %div5 to i32 + store i32 %conv6, i32* %iOutput, align 4 + ret void +} + +define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) { +; CHECK-LABEL: i32_mul_reduction( +; CHECK: vector.body: +; CHECK: @llvm.masked.load +; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + %mul.lcssa = phi i32 [ %mul, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: + %S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %S.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08 + %0 = load i32, i32* %arrayidx, align 4 + %mul = mul nsw i32 %0, %S.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) { +; CHECK-LABEL: i32_or_reduction( +; CHECK: vector.body: +; CHECK: @llvm.masked.load +; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %or.lcssa = phi i32 [ %or, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %S.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08 + %0 = load i32, i32* %arrayidx, align 4 + %or = or i32 %0, %S.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) { +; CHECK-LABEL: i32_and_reduction( +; CHECK: vector.body: +; CHECK: @llvm.masked.load +; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body +entry: + %cmp5 = icmp sgt i32 %N, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + %and.lcssa = phi i32 [ %and, %for.body ] + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ] + ret i32 %S.addr.0.lcssa + +for.body: ; preds = %for.body.preheader, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07 + %0 = load i32, i32* %arrayidx, align 4 + %and = and i32 %0, %S.addr.06 + %inc = add nuw nsw i32 %i.07, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -266,189 +266,6 @@ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body } -; Don't tail-fold float reductions. -; -define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) local_unnamed_addr #0 { -; CHECK-LABEL: f32_reduction( -; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body -entry: - %cmp6 = icmp eq i32 %N, 0 - br i1 %cmp6, label %while.end, label %while.body.preheader - -while.body.preheader: ; preds = %entry - br label %while.body - -while.body: ; preds = %while.body.preheader, %while.body - %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] - %sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ] - %Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ] - %incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1 - %0 = load float, float* %Input.addr.07, align 4 - %add = fadd fast float %0, %sum.08 - %dec = add i32 %blkCnt.09, -1 - %cmp = icmp eq i32 %dec, 0 - br i1 %cmp, label %while.end.loopexit, label %while.body - -while.end.loopexit: ; preds = %while.body - %add.lcssa = phi float [ %add, %while.body ] - br label %while.end - -while.end: ; preds = %while.end.loopexit, %entry - %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ] - %conv = uitofp i32 %N to float - %div = fdiv fast float %sum.0.lcssa, %conv - store float %div, float* %Output, align 4 - ret void -} - -; Don't tail-fold float reductions. -; -define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) local_unnamed_addr #0 { -; CHECK-LABEL: mixed_f32_i32_reduction( -; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body -entry: - %cmp15 = icmp eq i32 %N, 0 - br i1 %cmp15, label %while.end, label %while.body.preheader - -while.body.preheader: - br label %while.body - -while.body: - %blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] - %isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ] - %fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ] - %fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ] - %iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ] - %incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1 - %incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1 - %0 = load i32, i32* %iInput.addr.016, align 4 - %add2 = add nsw i32 %0, %isum.019 - %1 = load float, float* %fInput.addr.017, align 4 - %add = fadd fast float %1, %fsum.018 - %dec = add i32 %blkCnt.020, -1 - %cmp = icmp eq i32 %dec, 0 - br i1 %cmp, label %while.end.loopexit, label %while.body - -while.end.loopexit: - %add.lcssa = phi float [ %add, %while.body ] - %add2.lcssa = phi i32 [ %add2, %while.body ] - %phitmp = sitofp i32 %add2.lcssa to float - br label %while.end - -while.end: - %fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ] - %isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ] - %conv = uitofp i32 %N to float - %div = fdiv fast float %fsum.0.lcssa, %conv - store float %div, float* %fOutput, align 4 - %div5 = fdiv fast float %isum.0.lcssa, %conv - %conv6 = fptosi float %div5 to i32 - store i32 %conv6, i32* %iOutput, align 4 - ret void -} - -define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 { -; CHECK-LABEL: i32_mul_reduction( -; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body -entry: - %cmp6 = icmp sgt i32 %N, 0 - br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - br label %for.body - -for.cond.cleanup.loopexit: - %mul.lcssa = phi i32 [ %mul, %for.body ] - br label %for.cond.cleanup - -for.cond.cleanup: - %S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %S.0.lcssa - -for.body: - %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 - %mul = mul nsw i32 %0, %S.07 - %inc = add nuw nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body -} - -define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 { -; CHECK-LABEL: i32_or_reduction( -; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body -entry: - %cmp6 = icmp sgt i32 %N, 0 - br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: ; preds = %entry - br label %for.body - -for.cond.cleanup.loopexit: ; preds = %for.body - %or.lcssa = phi i32 [ %or, %for.body ] - br label %for.cond.cleanup - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %S.0.lcssa - -for.body: ; preds = %for.body.preheader, %for.body - %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08 - %0 = load i32, i32* %arrayidx, align 4 - %or = or i32 %0, %S.07 - %inc = add nuw nsw i32 %i.08, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body -} - -define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) local_unnamed_addr #0 { -; CHECK-LABEL: i32_and_reduction( -; CHECK: vector.body: -; CHECK-NOT: @llvm.masked.load -; CHECK-NOT: @llvm.masked.store -; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body -entry: - %cmp5 = icmp sgt i32 %N, 0 - br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: ; preds = %entry - br label %for.body - -for.cond.cleanup.loopexit: ; preds = %for.body - %and.lcssa = phi i32 [ %and, %for.body ] - br label %for.cond.cleanup - -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ] - ret i32 %S.addr.0.lcssa - -for.body: ; preds = %for.body.preheader, %for.body - %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ] - %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07 - %0 = load i32, i32* %arrayidx, align 4 - %and = and i32 %0, %S.addr.06 - %inc = add nuw nsw i32 %i.07, 1 - %exitcond = icmp eq i32 %inc, %N - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body -} - define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: i32_smin_reduction( ; CHECK: vector.body: