Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1619,13 +1619,24 @@
                                         TTI::TargetCostKind CostKind) {
   EVT ValVT = TLI->getValueType(DL, ValTy);
   EVT ResVT = TLI->getValueType(DL, ResTy);
+
   if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
     std::pair<InstructionCost, MVT> LT =
         TLI->getTypeLegalizationCost(DL, ValTy);
-    if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
-        (LT.second == MVT::v8i16 &&
-         ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
-        (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+
+    // The legal cases are:
+    //   VADDV u/s 8/16/32
+    //   VMLAV u/s 8/16/32
+    //   VADDLV u/s 32
+    //   VMLALV u/s 16/32
+    // Codegen currently cannot always handle larger than legal vectors very
+    // well, especially for predicated reductions where the mask needs to be
+    // split, so restrict to 128bit or smaller input types.
+    unsigned RevVTSize = ResVT.getSizeInBits();
+    if (ValVT.getSizeInBits() <= 128 &&
+        ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
+         (LT.second == MVT::v8i16 && RevVTSize <= (IsMLA ? 64 : 32)) ||
+         (LT.second == MVT::v4i32 && RevVTSize <= 64)))
       return ST->getMVEVectorCostFactor(CostKind) * LT.first;
   }

Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7137,6 +7137,7 @@
 Optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(
     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+  using namespace llvm::PatternMatch;
   // Early exit for no inloop reductions
   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
     return None;
@@ -7155,13 +7156,12 @@
   // it is not we return an invalid cost specifying the orignal cost method
   // should be used.
   Instruction *RetI = I;
-  if ((RetI->getOpcode() == Instruction::SExt ||
-       RetI->getOpcode() == Instruction::ZExt)) {
+  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
     if (!RetI->hasOneUser())
       return None;
     RetI = RetI->user_back();
   }
-  if (RetI->getOpcode() == Instruction::Mul &&
+  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
       RetI->user_back()->getOpcode() == Instruction::Add) {
     if (!RetI->hasOneUser())
       return None;
@@ -7193,8 +7193,34 @@

   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

-  if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
-      !TheLoop->isLoopInvariant(RedOp)) {
+  Instruction *Op0, *Op1;
+  if (RedOp &&
+      match(RedOp,
+            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
+      match(Op0, m_ZExtOrSExt(m_Value())) &&
+      Op0->getOpcode() == Op1->getOpcode() &&
+      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
+      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
+    // Matched reduce(ext(mul(ext(A), ext(B)))
+    bool IsUnsigned = isa<ZExtInst>(Op0);
+    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
+
+    InstructionCost ExtCost =
+        TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
+                             TTI::CastContextHint::None, CostKind, Op0);
+    InstructionCost MulCost =
+        TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+
+    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
+        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+        CostKind);
+
+    if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
+      return I == RetI ? RedCost : 0;
+  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
+             !TheLoop->isLoopInvariant(RedOp)) {
+    // Matched reduce(ext(A))
     bool IsUnsigned = isa<ZExtInst>(RedOp);
     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
@@ -7206,22 +7232,20 @@
                              TTI::CastContextHint::None, CostKind, RedOp);
     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
       return I == RetI ? RedCost : 0;
-  } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
-    Instruction *Mul = RedOp;
-    Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
-    Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
-    if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
+  } else if (RedOp &&
+             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
+    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
         Op0->getOpcode() == Op1->getOpcode() &&
        Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
       bool IsUnsigned = isa<ZExtInst>(Op0);
       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
-      // reduce(mul(ext, ext))
+      // Matched reduce(mul(ext, ext))
       InstructionCost ExtCost =
           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
                                TTI::CastContextHint::None, CostKind, Op0);
       InstructionCost MulCost =
-          TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
@@ -7229,9 +7253,10 @@

       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
         return I == RetI ? RedCost : 0;
-    } else {
+    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
+      // Matched reduce(mul())
       InstructionCost MulCost =
-          TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -725,34 +725,34 @@
 }

 ; 8x to use VMLAL.u16
-; FIXME: 8x, TailPredicate, double-extended
+; FIXME: TailPredicate
 define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i8_i64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -16
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -8
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <16 x i32> [[TMP6]] to <16 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP7]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i8> [[WIDE_LOAD1]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <8 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <8 x i32> [[TMP6]] to <8 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
 ; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
@@ -1143,26 +1143,26 @@
 ; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 7
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP4]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       for.cond.cleanup:
@@ -1200,34 +1200,34 @@
   ret i32 %s.0.lcssa
 }

-; FIXME: 4x as different sext vs zext
+; 4x as different sext vs zext
 define i64 @red_mla_ext_s16_u16_s64(i16* noalias nocapture readonly %A, i16* noalias nocapture readonly %B, i32 %n) #0 {
 ; CHECK-LABEL: @red_mla_ext_s16_u16_s64(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -8
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N]], -4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD1]] to <8 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = zext <8 x i32> [[TMP6]] to <8 x i64>
-; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP4]], align 2
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i16> [[WIDE_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext <4 x i32> [[TMP6]] to <4 x i64>
+; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP7]])
 ; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       middle.block:
@@ -1288,33 +1288,33 @@
   ret i64 %s.0.lcssa
 }

-; FIXME: 4x as different sext vs zext
+; 4x as different sext vs zext
 define i32 @red_mla_u8_s8_u32(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i32 %n) #0 {
 ; CHECK-LABEL: @red_mla_u8_s8_u32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 15
-; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
-; CHECK-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]])
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
 ; CHECK-NEXT:    [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       for.cond.cleanup: