Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1182,6 +1182,15 @@
       VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
+  /// Calculate the cost of an extended reduction pattern, similar to
+  /// getArithmeticReductionCost. This is the cost of
+  /// ResTy vecreduce.add(ext(Ty A)), or if the IsMLA flag is set,
+  /// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B))). The reduction happens
+  /// on a VectorType with ResTy elements and the same number of lanes as Ty.
+  int getExtendedReductionCost(
+      bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
   /// Three cases are handled: 1. scalar instruction 2. vector instruction
   /// 3. scalar instruction which is to be vectorized.
@@ -1588,6 +1597,9 @@
   virtual int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                      bool IsPairwiseForm, bool IsUnsigned,
                                      TTI::TargetCostKind CostKind) = 0;
+  virtual int getExtendedReductionCost(
+      bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
   virtual int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                     TTI::TargetCostKind CostKind) = 0;
   virtual int getCallInstrCost(Function *F, Type *RetTy,
@@ -2065,6 +2077,12 @@
     return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned,
                                        CostKind);
   }
+  int getExtendedReductionCost(
+      bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override {
+    return Impl.getExtendedReductionCost(IsMLA, IsUnsigned, ResTy, Ty,
+                                         CostKind);
+  }
   int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                             TTI::TargetCostKind CostKind) override {
     return Impl.getIntrinsicInstrCost(ICA, CostKind);
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -599,6 +599,12 @@
     return 1;
   }
 
+  unsigned getExtendedReductionCost(
+      bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
+    return 1;
+  }
+
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const { return 0; }
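
For reference, a minimal IR sketch of the two shapes the new hook is asked to cost (illustrative only, not part of the patch; the function names are invented), assuming ResTy = i32 and Ty = <16 x i8>:

define i32 @reduce_ext(<16 x i8> %a) {
  ; reduce(ext(A)), IsMLA = false
  %e = sext <16 x i8> %a to <16 x i32>
  %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
  ret i32 %r
}

define i32 @reduce_mla(<16 x i8> %a, <16 x i8> %b) {
  ; reduce(mul(ext(A), ext(B))), IsMLA = true
  %ea = sext <16 x i8> %a to <16 x i32>
  %eb = sext <16 x i8> %b to <16 x i32>
  %m = mul <16 x i32> %ea, %eb
  %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
  ret i32 %r
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
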
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2018,6 +2018,25 @@
            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  unsigned getExtendedReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,
+                                    VectorType *Ty,
+                                    TTI::TargetCostKind CostKind) {
+    // Without any native support, this is equivalent to the cost of
+    // vecreduce.add(ext) or, if IsMLA, vecreduce.add(mul(ext, ext)).
+    unsigned RedCost = thisT()->getArithmeticReductionCost(Instruction::Add, Ty,
+                                                           false, CostKind);
+    unsigned MulCost = 0;
+    unsigned ExtCost = thisT()->getCastInstrCost(
+        IsUnsigned ? Instruction::ZExt : Instruction::SExt,
+        VectorType::get(ResTy, Ty), Ty, TTI::CastContextHint::None, CostKind);
+    if (IsMLA) {
+      MulCost = thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
+      ExtCost *= 2;
+    }
+
+    return RedCost + MulCost + ExtCost;
+  }
+
   unsigned getVectorSplitCost() { return 1; }
 
   /// @}
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -943,6 +943,15 @@
   return Cost;
 }
 
+int TargetTransformInfo::getExtendedReductionCost(
+    bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
+    TTI::TargetCostKind CostKind) const {
+  int Cost =
+      TTIImpl->getExtendedReductionCost(IsMLA, IsUnsigned, ResTy, Ty, CostKind);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 unsigned
 TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
   return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -247,6 +247,8 @@
   int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                  bool IsPairwiseForm,
                                  TTI::TargetCostKind CostKind);
+  int getExtendedReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,
+                               VectorType *ValTy, TTI::TargetCostKind CostKind);
 
   int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                             TTI::TargetCostKind CostKind);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1511,6 +1511,24 @@
                                            CostKind);
 }
 
+int ARMTTIImpl::getExtendedReductionCost(bool IsMLA, bool IsUnsigned,
+                                         Type *ResTy, VectorType *ValTy,
+                                         TTI::TargetCostKind CostKind) {
+  EVT ValVT = TLI->getValueType(DL, ValTy);
+  EVT ResVT = TLI->getValueType(DL, ResTy);
+  if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+    if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) ||
+        (LT.second == MVT::v8i16 &&
+         ResVT.getSizeInBits() <= (IsMLA ? 64 : 32)) ||
+        (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64))
+      return ST->getMVEVectorCostFactor() * LT.first;
+  }
+
+  return BaseT::getExtendedReductionCost(IsMLA, IsUnsigned, ResTy, ValTy,
+                                         CostKind);
+}
+
 int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                       TTI::TargetCostKind CostKind) {
   // Currently we make a somewhat optimistic assumption that active_lane_mask's
@@ -2053,7 +2071,7 @@
   unsigned ScalarBits = Ty->getScalarSizeInBits();
   switch (Opcode) {
   case Instruction::Add:
-    return ScalarBits <= 32;
+    return ScalarBits <= 64;
   default:
     return false;
   }
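
One concrete case that the MVE hook above accepts, sketched as standalone IR (illustrative only, not part of the patch; the function name is invented): an 8-lane i16 MLA reduction into an i64 accumulator, i.e. LT.second == v8i16 with a 64-bit result and IsMLA set, which the test comments below associate with VMLAL.u16.

define i64 @mlal_v8i16(<8 x i16> %a, <8 x i16> %b) {
  %ea = sext <8 x i16> %a to <8 x i64>
  %eb = sext <8 x i16> %b to <8 x i64>
  %m = mul <8 x i64> %ea, %eb
  %r = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
  ret i64 %r
}

declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
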
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1654,6 +1654,11 @@
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                      Type *&VectorTy);
 
+  /// Return the cost of instructions in an inloop reduction pattern, if I is
+  /// part of that pattern.
+  int getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
+                              TTI::TargetCostKind CostKind);
+
   /// Calculate vectorization cost of memory instruction \p I.
   unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF);
 
@@ -1737,6 +1742,12 @@
   /// (PHI) to bottom
   ReductionChainMap InLoopReductionChains;
 
+  /// A map of inloop reduction operations and their immediate chain operand.
+  /// FIXME: This can be removed once reductions can be costed correctly in
+  /// vplan. This was added to allow quick lookup of the inloop operations,
+  /// without having to loop through InLoopReductionChains.
+  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
+
   /// Returns the expected difference in cost from scalarizing the expression
   /// feeding a predicated instruction \p PredInst. The instructions to
   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -5971,6 +5982,11 @@
       if (!Legal->isReductionVariable(PN))
         continue;
       RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
+      if (PreferInLoopReductions ||
+          TTI.preferInLoopReduction(RdxDesc.getOpcode(),
+                                    RdxDesc.getRecurrenceType(),
+                                    TargetTransformInfo::ReductionFlags()))
+        continue;
       T = RdxDesc.getRecurrenceType();
     }
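
To make the pattern walk in the next hunk concrete, here is a hypothetical scalar reduction chain of the reduce(mul(ext(A), ext(B))) form (an illustrative fragment only; the surrounding loop and the definitions of %phi, %pa and %pb are elided):

  %la = load i8, i8* %pa
  %lb = load i8, i8* %pb
  %ea = sext i8 %la to i32
  %eb = sext i8 %lb to i32
  %m = mul i32 %ea, %eb
  %add = add i32 %phi, %m        ; the in-loop reduction operation

For such a chain, InLoopReductionImmediateChains maps %add back to %phi, so the code below can start at %ea, %eb or %m, walk the single-use edges up to %add, confirm it belongs to an in-loop reduction, and then compare the extended-reduction cost against the sum of the individual ext, mul and plain reduction costs.
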
@@ -6799,6 +6815,110 @@
   return Cost;
 }
 
+int LoopVectorizationCostModel::getReductionPatternCost(
+    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+  // Early exit for no inloop reductions
+  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
+    return -1;
+  auto *VectorTy = cast<VectorType>(Ty);
+
+  // We are looking for a pattern of, and finding the minimal acceptable cost:
+  //  reduce(mul(ext(A), ext(B))) or
+  //  reduce(mul(A, B)) or
+  //  reduce(ext(A)) or
+  //  reduce(A).
+  // The basic idea is that we walk down the tree to do that, finding the root
+  // reduction instruction in InLoopReductionImmediateChains. From there we find
+  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
+  // of the components. If the reduction cost is lower, we return it for the
+  // reduction instruction and 0 for the other instructions in the pattern. If
+  // it is not, we return -1, specifying that the original cost method should be used.
+  Instruction *RetI = I;
+  if ((RetI->getOpcode() == Instruction::SExt ||
+       RetI->getOpcode() == Instruction::ZExt) &&
+      RetI->hasOneUser())
+    RetI = RetI->user_back();
+  if (RetI->getOpcode() == Instruction::Mul && RetI->hasOneUse() &&
+      RetI->user_back()->getOpcode() == Instruction::Add)
+    RetI = RetI->user_back();
+
+  // Test if the found instruction is a reduction, and if not return -1 so
+  // that the parent uses the original cost modelling.
+  if (!InLoopReductionImmediateChains.count(RetI))
+    return -1;
+
+  // Find the reduction this chain is a part of and calculate the basic cost of
+  // the reduction on its own.
+  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+  Instruction *ReductionPhi = LastChain;
+  while (!isa<PHINode>(ReductionPhi))
+    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+
+  RecurrenceDescriptor RdxDesc =
+      Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+  unsigned BaseCost = TTI.getArithmeticReductionCost(RdxDesc.getOpcode(),
+                                                     VectorTy, false, CostKind);
+
+  // Get the operand that was not the reduction chain and match it to one of the
+  // patterns, returning the better cost if it is found.
+  Instruction *RedOp = RetI->getOperand(1) == LastChain
+                           ? dyn_cast<Instruction>(RetI->getOperand(0))
+                           : dyn_cast<Instruction>(RetI->getOperand(1));
+
+  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
+
+  if (RedOp && (isa<ZExtInst>(RedOp) || isa<SExtInst>(RedOp)) &&
+      !TheLoop->isLoopInvariant(RedOp)) {
+    bool IsUnsigned = isa<ZExtInst>(RedOp);
+    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
+    unsigned RedCost = TTI.getExtendedReductionCost(
+        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+        CostKind);
+
+    unsigned ExtCost =
+        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
+                             TTI::CastContextHint::None, CostKind, RedOp);
+    if (RedCost < BaseCost + ExtCost)
+      return I == RetI ? RedCost : 0;
+  } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
+    Instruction *Mul = RedOp;
+    Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
+    Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
+    if (Op0 && Op1 && (isa<ZExtInst>(Op0) || isa<SExtInst>(Op0)) &&
+        Op0->getOpcode() == Op1->getOpcode() &&
+        Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
+        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
+      bool IsUnsigned = isa<ZExtInst>(Op0);
+      auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
+      // reduce(mul(ext, ext))
+      unsigned ExtCost =
+          TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
+                               TTI::CastContextHint::None, CostKind, Op0);
+      unsigned MulCost =
+          TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+
+      unsigned RedCost = TTI.getExtendedReductionCost(
+          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
+          CostKind);
+
+      if (RedCost < ExtCost * 2 + MulCost + BaseCost)
+        return I == RetI ? RedCost : 0;
+    } else {
+      unsigned MulCost =
+          TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
+
+      unsigned RedCost = TTI.getExtendedReductionCost(
+          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
+          CostKind);
+
+      if (RedCost < MulCost + BaseCost)
+        return I == RetI ? RedCost : 0;
+    }
+  }
+
+  return I == RetI ? BaseCost : -1;
+}
+
 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
                                                               ElementCount VF) {
   // Calculate scalar cost only. Vectorization cost should be ready at this
@@ -7152,6 +7272,12 @@
     // Since we will replace the stride by 1 the multiplication should go away.
     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
       return 0;
+
+    // Detect reduction patterns
+    int RedCost;
+    if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) != -1)
+      return RedCost;
+
     // Certain instructions can be cheaper to vectorize if they have a constant
     // second vector operand. One example of this are shifts on x86.
     Value *Op2 = I->getOperand(1);
@@ -7275,6 +7401,11 @@
                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
     }
+    // Detect reduction patterns
+    int RedCost;
+    if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) != -1)
+      return RedCost;
+
     Type *SrcScalarTy = I->getOperand(0)->getType();
     Type *SrcVecTy =
         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
@@ -7403,8 +7534,15 @@
     SmallVector<Instruction *, 4> ReductionOperations =
         RdxDesc.getReductionOpChain(Phi, TheLoop);
     bool InLoop = !ReductionOperations.empty();
-    if (InLoop)
+    if (InLoop) {
       InLoopReductionChains[Phi] = ReductionOperations;
+      // Add the elements to InLoopReductionImmediateChains for cost modelling.
+      Instruction *LastChain = Phi;
+      for (auto *I : ReductionOperations) {
+        InLoopReductionImmediateChains[I] = LastChain;
+        LastChain = I;
+      }
+    }
     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ?
"inloop" : "out of loop") << " reduction for phi: " << *Phi << "\n"); } Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -12,33 +12,33 @@ ; CHECK: for.body.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP0]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP3]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP7]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) -; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP9]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]]) ; CHECK-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -4,6 +4,7 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" +; Should not be vectorized define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i64_i64( ; CHECK-NEXT: entry: @@ -41,24 +42,50 @@ ret i64 %r.0.lcssa } -; FIXME: 4x +; 4x to use VADDLV +; FIXME: TailPredicate define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi 
i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -81,24 +108,50 @@ ret i64 %r.0.lcssa } -; FIXME: 4x ? +; 4x to use VADDLV +; FIXME: TailPredicate define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: 
[[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i64 +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -121,24 +174,50 @@ ret i64 %r.0.lcssa } -; FIXME: 4x ? +; 4x to use VADDLV +; FIXME: TailPredicate define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: 
[[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i64 +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP6]] to i64 ; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -161,6 +240,7 @@ ret i64 %r.0.lcssa } +; 4x to use VADDV.u32 define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32_i32( ; CHECK-NEXT: entry: @@ -182,7 +262,7 @@ ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -206,30 +286,30 @@ ret i32 %r.0.lcssa } -; FIXME: 8x +; 8x to use VADDV.u16 define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> 
poison) -; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison) +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -254,30 +334,30 @@ ret i32 %r.0.lcssa } -; FIXME: 16x +; 16x to use VADDV.u16 define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison) -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -302,6 +382,7 @@ ret i32 %r.0.lcssa } +; 8x to use VADDV.u16 define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i16( ; CHECK-NEXT: entry: @@ -323,7 +404,7 @@ ; CHECK-NEXT: [[TMP4]] = add i16 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] @@ -347,30 +428,30 @@ ret i16 %r.0.lcssa } -; FIXME: 16x ? +; 16x to use VADDV.u8 define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP2]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i16 
@llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] @@ -395,6 +476,7 @@ ret i16 %r.0.lcssa } +; 16x to use VADDV.u8 define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i8( ; CHECK-NEXT: entry: @@ -416,7 +498,7 @@ ; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i8 [[R_0_LCSSA]] @@ -440,6 +522,7 @@ ret i8 %r.0.lcssa } +; Not vectorized define i64 @mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i64_i64( ; CHECK-NEXT: entry: @@ -483,26 +566,57 @@ ret i64 %r.0.lcssa } +; 4x to use VMLAL.u32 +; FIXME: TailPredicate define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i32_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP4]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]]) +; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
[[LOOP14:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_010]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[I_010]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_010]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y]], i32 [[I_010]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]] ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -528,28 +642,61 @@ ret i64 %r.0.lcssa } +; 8x to use VMLAL.u16 +; FIXME: TailPredicate define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i16_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 +; 
CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 -; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_012]] +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_012]] +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] ; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -577,28 +724,61 @@ ret i64 %r.0.lcssa } +; 8x to use VMLAL.u16 +; FIXME: 8x, TailPredicate, double-extended define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 
{ ; CHECK-LABEL: @mla_i8_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i32> [[TMP6]] to <16 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_012]] +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y]], i32 [[I_012]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* 
[[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32
 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]]
 ; CHECK-NEXT: [[CONV3:%.*]] = zext i32 [[MUL]] to i64
 ; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]]
 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i64 [[R_0_LCSSA]]
 ;
 entry:
@@ -626,6 +806,7 @@
   ret i64 %r.0.lcssa
 }

+; 4x to use VMLA.u32
 define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i32_i32(
 ; CHECK-NEXT: entry:
@@ -651,7 +832,7 @@
 ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
@@ -678,34 +859,35 @@
   ret i32 %r.0.lcssa
 }

+; 8x to use VMLA.u16
 define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i16_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP4]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD1]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
 ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP21:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
@@ -734,34 +916,35 @@
   ret i32 %r.0.lcssa
 }

+; 16x to use VMLA.u8
 define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i8_i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]])
 ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i32 [[R_0_LCSSA]]
@@ -790,6 +973,7 @@
   ret i32 %r.0.lcssa
 }

+; 8x to use VMLA.u16
 define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i16_i16(
 ; CHECK-NEXT: entry:
@@ -815,7 +999,7 @@
 ; CHECK-NEXT: [[TMP7]] = add i16 [[TMP6]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP23:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
@@ -842,34 +1026,35 @@
   ret i16 %r.0.lcssa
 }

+; 16x to use VMLA.u8
 define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i8_i16(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
+; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]])
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]])
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16>
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP4]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD1]] to <8 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <8 x i16> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP6]], <8 x i16> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]])
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <16 x i16> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP6]], <16 x i16> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]])
 ; CHECK-NEXT: [[TMP9]] = add i16 [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i16 [[R_0_LCSSA]]
@@ -898,6 +1083,7 @@
   ret i16 %r.0.lcssa
 }

+; 16x to use VMLA.u8
 define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 {
 ; CHECK-LABEL: @mla_i8_i8(
 ; CHECK-NEXT: entry:
@@ -923,7 +1109,7 @@
 ; CHECK-NEXT: [[TMP7]] = add i8 [[TMP6]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP25:!llvm.loop !.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: ret i8 [[R_0_LCSSA]]
@@ -983,7 +1169,7 @@
 ; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[SCALAR_PH]]
@@ -1003,7 +1189,7 @@
 ; CHECK-NEXT: [[RED_2]] = add i32 [[RED_1]], [[L_1]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 2
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], [[LOOP14:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], [[LOOP27:!llvm.loop !.*]]
 ; CHECK: exit:
 ; CHECK-NEXT: [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i32 [[RET_LCSSA]]