Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1160,10 +1160,10 @@ /// Calculate the cost of performing a vector reduction. /// - /// This is the cost of reducing the vector value of type \p Ty to a scalar - /// value using the operation denoted by \p Opcode. The form of the reduction - /// can either be a pairwise reduction or a reduction that splits the vector - /// at every reduction level. + /// This is the cost of reducing the vector value of type \p Ty to a type + /// \p ResTy using the operation denoted by \p Opcode. The form of the + /// reduction can either be a pairwise reduction or a reduction that splits + /// the vector at every reduction level. /// /// Pairwise: /// (v0, v1, v2, v3) @@ -1171,9 +1171,12 @@ /// Split: /// (v0, v1, v2, v3) /// ((v0+v2), (v1+v3), undef, undef) + /// + /// If the IsMLA flag is set, then this includes the cost of a multiply step. int getArithmeticReductionCost( - unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; + unsigned Opcode, Type *ResTy, VectorType *Ty, bool IsPairwiseForm, + bool IsMLA = false, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; int getMinMaxReductionCost( VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, @@ -1575,8 +1578,9 @@ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; - virtual int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwiseForm, + virtual int getArithmeticReductionCost(unsigned Opcode, Type *ResTy, + VectorType *Ty, bool IsPairwiseForm, + bool IsMLA, TTI::TargetCostKind CostKind) = 0; virtual int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, @@ -2042,11 +2046,11 @@ Alignment, AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); } - int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwiseForm, + int getArithmeticReductionCost(unsigned Opcode, Type *ResTy, VectorType *Ty, + bool IsPairwiseForm, bool IsMLA, TTI::TargetCostKind CostKind) override { - return Impl.getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, - CostKind); + return Impl.getArithmeticReductionCost(Opcode, ResTy, Ty, IsPairwiseForm, + IsMLA, CostKind); } int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -571,8 +571,10 @@ return 0; } - unsigned getArithmeticReductionCost(unsigned, VectorType *, bool, - TTI::TargetCostKind) { return 1; } + unsigned getArithmeticReductionCost(unsigned, Type *, VectorType *, bool, + bool, TTI::TargetCostKind) { + return 1; + } unsigned getMinMaxReductionCost(VectorType *, VectorType *, bool, bool, TTI::TargetCostKind) { return 1; } @@ -1023,8 +1025,9 @@ bool IsPairwise; switch (TTI::matchVectorReduction(EEI, RdxOpcode, RdxType, IsPairwise)) { case TTI::RK_Arithmetic: - return TargetTTI->getArithmeticReductionCost(RdxOpcode, RdxType, - IsPairwise, CostKind); + return 
TargetTTI->getArithmeticReductionCost( + RdxOpcode, RdxType->getScalarType(), RdxType, IsPairwise, false, + CostKind); case TTI::RK_MinMax: return TargetTTI->getMinMaxReductionCost( RdxType, cast(CmpInst::makeCmpResultType(RdxType)), Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1542,35 +1542,35 @@ CostKind); } case Intrinsic::vector_reduce_add: - return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::Add, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_mul: - return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::Mul, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_and: - return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::And, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_or: - return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::Or, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_xor: - return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::Xor, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_fadd: // FIXME: Add new flag for cost of strict reductions. - return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::FAdd, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_fmul: // FIXME: Add new flag for cost of strict reductions. - return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, - /*IsPairwiseForm=*/false, - CostKind); + return thisT()->getArithmeticReductionCost( + Instruction::FMul, RetTy, VecOpTy, + /*IsPairwiseForm=*/false, /*IsMLA=*/false, CostKind); case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: case Intrinsic::vector_reduce_fmax: @@ -1906,8 +1906,9 @@ /// /// The cost model should take into account that the actual length of the /// vector is reduced on each iteration. - unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwise, + unsigned getArithmeticReductionCost(unsigned Opcode, Type *ResTy, + VectorType *Ty, bool IsPairwise, + bool IsMLA, TTI::TargetCostKind CostKind) { Type *ScalarTy = Ty->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); @@ -1919,6 +1920,9 @@ unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1; + if (IsMLA) + ArithCost += + thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); while (NumVecElts > MVTLen) { NumVecElts /= 2; VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -919,12 +919,11 @@ return Cost; } -int TargetTransformInfo::getArithmeticReductionCost(unsigned Opcode, - VectorType *Ty, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind) const { - int Cost = TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm, - CostKind); +int TargetTransformInfo::getArithmeticReductionCost( + unsigned Opcode, Type *ResTy, VectorType *Ty, bool IsPairwiseForm, + bool IsMLA, TTI::TargetCostKind CostKind) const { + int Cost = TTIImpl->getArithmeticReductionCost( + Opcode, ResTy, Ty, IsPairwiseForm, IsMLA, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -231,9 +231,9 @@ bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; - int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); + int getArithmeticReductionCost( + unsigned Opcode, Type *ResTy, VectorType *Ty, bool IsPairwiseForm, + bool IsMLA, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp); Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1076,14 +1076,14 @@ return false; } -int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, +int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ResTy, VectorType *ValTy, - bool IsPairwiseForm, + bool IsPairwiseForm, bool IsMLA, TTI::TargetCostKind CostKind) { - if (IsPairwiseForm) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + if (IsPairwiseForm || IsMLA || ResTy != ValTy->getScalarType()) + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValTy, + IsPairwiseForm, IsMLA, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -1104,8 +1104,8 @@ if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return LT.first * Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValTy, IsPairwiseForm, + IsMLA, CostKind); } int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -255,9 +255,7 @@ int getInlinerVectorBonusPercent() { return 0; } int getArithmeticReductionCost( - unsigned Opcode, - VectorType *Ty, - bool IsPairwise, + unsigned Opcode, Type 
*ResTy, VectorType *Ty, bool IsPairwise, bool IsMLA, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -777,17 +777,18 @@ } } -int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwise, +int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ResTy, + VectorType *Ty, bool IsPairwise, + bool IsMLA, TTI::TargetCostKind CostKind) { EVT OrigTy = TLI->getValueType(DL, Ty); // Computes cost on targets that have packed math instructions(which support // 16-bit types only). - if (IsPairwise || - !ST->hasVOP3PInsts() || - OrigTy.getScalarSizeInBits() != 16) - return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind); + if (IsPairwise || IsMLA || ResTy != Ty->getScalarType() || + !ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16) + return BaseT::getArithmeticReductionCost(Opcode, ResTy, Ty, IsPairwise, + IsMLA, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); return LT.first * getFullRateInstrCost(); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -244,9 +244,9 @@ Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); - int getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind); + int getArithmeticReductionCost(unsigned Opcode, Type *ResTy, + VectorType *ValTy, bool IsPairwiseForm, + bool IsMLA, TTI::TargetCostKind CostKind); int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1475,27 +1475,27 @@ return ScalarCost; } -int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwiseForm, +int ARMTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ResTy, + VectorType *ValTy, + bool IsPairwiseForm, bool IsMLA, TTI::TargetCostKind CostKind) { EVT ValVT = TLI->getValueType(DL, ValTy); + EVT ResVT = TLI->getValueType(DL, ResTy); int ISD = TLI->InstructionOpcodeToISD(Opcode); - if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || !ResVT.isSimple() || + ISD != ISD::ADD) + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValTy, + IsPairwiseForm, IsMLA, CostKind); std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); - static const CostTblEntry CostTblAdd[]{ - {ISD::ADD, MVT::v16i8, 1}, - {ISD::ADD, MVT::v8i16, 1}, - {ISD::ADD, MVT::v4i32, 1}, - }; - if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second)) - return Entry->Cost * ST->getMVEVectorCostFactor() * LT.first; + if ((LT.second == MVT::v16i8 && ResVT.getSizeInBits() <= 32) || + (LT.second == MVT::v8i16 && ResVT.getSizeInBits() <= (IsMLA ? 
64 : 32)) || + (LT.second == MVT::v4i32 && ResVT.getSizeInBits() <= 64)) + return ST->getMVEVectorCostFactor() * LT.first; - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValTy, IsPairwiseForm, + IsMLA, CostKind); } int ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, @@ -2041,7 +2041,7 @@ unsigned ScalarBits = Ty->getScalarSizeInBits(); switch (Opcode) { case Instruction::Add: - return ScalarBits <= 32; + return ScalarBits <= 64; default: return false; } Index: llvm/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -172,9 +172,9 @@ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); - int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, - bool IsPairwiseForm, - TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); + int getArithmeticReductionCost( + unsigned Opcode, Type *ResTy, VectorType *Ty, bool IsPairwiseForm, + bool IsMLA, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned); Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3332,12 +3332,14 @@ return BaseT::getAddressComputationCost(Ty, SE, Ptr); } -int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, - bool IsPairwise, +int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ResTy, + VectorType *ValTy, bool IsPairwise, + bool IsMLA, TTI::TargetCostKind CostKind) { // Just use the default implementation for pair reductions. - if (IsPairwise) - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind); + if (IsPairwise || IsMLA || ResTy != ValTy->getScalarType()) + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValTy, IsPairwise, + IsMLA, CostKind); // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. @@ -3494,8 +3496,8 @@ if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) return ArithmeticCost + Entry->Cost; - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValVTy, IsPairwise, + IsMLA, CostKind); } unsigned NumVecElts = ValVTy->getNumElements(); @@ -3504,8 +3506,8 @@ // Special case power of 2 reductions where the scalar type isn't changed // by type legalization. if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) - return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise, - CostKind); + return BaseT::getArithmeticReductionCost(Opcode, ResTy, ValVTy, IsPairwise, + IsMLA, CostKind); unsigned ReductionCost = 0; Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1635,6 +1635,11 @@ /// the vector type as an output parameter. unsigned getInstructionCost(Instruction *I, ElementCount VF, Type *&VectorTy); + /// Return the cost of instructions in an inloop reduction pattern, if I is + /// part of that pattern. 
+ int getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy, TTI::TargetCostKind CostKind); + /// Calculate vectorization cost of memory instruction \p I. unsigned getMemoryInstructionCost(Instruction *I, ElementCount VF); @@ -1718,6 +1723,12 @@ /// (PHI) to bottom ReductionChainMap InLoopReductionChains; + /// A map of inloop reduction operations and their immediate chain operand. + /// FIXME: This can be removed once reductions can be costed correctly in + /// vplan. This was added to allow quick lookup to the inloop operations, + /// without having to loop through InLoopReductionChains. + DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -5855,6 +5866,11 @@ if (!Legal->isReductionVariable(PN)) continue; RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN]; + if (PreferInLoopReductions || + TTI.preferInLoopReduction(RdxDesc.getRecurrenceBinOp(), + RdxDesc.getRecurrenceType(), + TargetTransformInfo::ReductionFlags())) + continue; T = RdxDesc.getRecurrenceType(); } @@ -6682,6 +6698,110 @@ return Cost; } +int LoopVectorizationCostModel::getReductionPatternCost( + Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) { + // Early exit for no inloop reductions + if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty)) + return -1; + auto *VectorTy = cast<VectorType>(Ty); + + // We are looking for a pattern of, and finding the minimal acceptable cost: + // reduce(mul(ext(A), ext(B))) or + // reduce(mul(A, B)) or + // reduce(ext(A)) or + // reduce(A). + // The basic idea is that we walk down the tree to do that, finding the root + // reduction instruction in InLoopReductionImmediateChains. From there we find + // the pattern of mul/ext and test the cost of the entire pattern vs the cost + // of the components. If the reduction cost is lower, then we return it for the + // reduction instruction and 0 for the other instructions in the pattern. If + // it is not, we return -1 specifying that the original cost method should be used. + Instruction *RetI = I; + if ((RetI->getOpcode() == Instruction::SExt || + RetI->getOpcode() == Instruction::ZExt) && + RetI->hasOneUser()) + RetI = RetI->user_back(); + if (RetI->getOpcode() == Instruction::Mul && RetI->hasOneUse() && + RetI->user_back()->getOpcode() == Instruction::Add) + RetI = RetI->user_back(); + + // Test if the found instruction is a reduction, and if not return -1 + // specifying the parent to use the original cost modelling. + if (!InLoopReductionImmediateChains.count(RetI)) + return -1; + + // Find the reduction this chain is a part of and calculate the basic cost of + // the reduction on its own. + Instruction *LastChain = InLoopReductionImmediateChains[RetI]; + Instruction *ReductionPhi = LastChain; + while (!isa<PHINode>(ReductionPhi)) + ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; + + RecurrenceDescriptor RdxDesc = + Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + unsigned BaseCost = TTI.getArithmeticReductionCost( + RdxDesc.getRecurrenceBinOp(), RdxDesc.getRecurrenceType(), VectorTy, + false, false, TTI::TCK_RecipThroughput); + + // Get the operand that was not the reduction chain and match it to one of the + // patterns, returning the better cost if it is found. + Value *RedOp = RetI->getOperand(1) == LastChain ? 
RetI->getOperand(0) + : RetI->getOperand(1); + + VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy); + + if (isa(RedOp) || isa(RedOp)) { + auto *RedOpI = cast(RedOp); + auto *ExtType = VectorType::get(RedOpI->getOperand(0)->getType(), VectorTy); + unsigned RedCost = TTI.getArithmeticReductionCost( + RdxDesc.getRecurrenceBinOp(), RdxDesc.getRecurrenceType(), ExtType, + false, false, TTI::TCK_RecipThroughput); + + unsigned ExtCost = + TTI.getCastInstrCost(RedOpI->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, RedOpI); + if (RedCost < BaseCost + ExtCost) + return I == RetI ? RedCost : 0; + } else if (isa(RedOp) && + cast(RedOp)->getOpcode() == Instruction::Mul) { + Instruction *Mul = cast(RedOp); + Value *Op0 = Mul->getOperand(0); + Value *Op1 = Mul->getOperand(1); + if ((isa(Op0) || isa(Op0)) && isa(Op1) && + cast(Op0)->getOpcode() == + cast(Op1)->getOpcode() && + Op0->getType() == Op1->getType()) { + auto *ExtType = VectorType::get( + cast(Op0)->getOperand(0)->getType(), VectorTy); + // reduce(mul(ext, ext)) + unsigned ExtCost = TTI.getCastInstrCost( + cast(Op0)->getOpcode(), VectorTy, ExtType, + TTI::CastContextHint::None, CostKind, cast(Op0)); + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + unsigned RedCost = TTI.getArithmeticReductionCost( + RdxDesc.getRecurrenceBinOp(), RdxDesc.getRecurrenceType(), ExtType, + false, true, TTI::TCK_RecipThroughput); + + if (RedCost < ExtCost * 2 + MulCost + BaseCost) + return I == RetI ? RedCost : 0; + } else { + unsigned MulCost = + TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind); + + unsigned RedCost = TTI.getArithmeticReductionCost( + RdxDesc.getRecurrenceBinOp(), RdxDesc.getRecurrenceType(), VectorTy, + false, true, TTI::TCK_RecipThroughput); + + if (RedCost < MulCost + BaseCost) + return I == RetI ? RedCost : 0; + } + } + + return I == RetI ? BaseCost : -1; +} + unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this @@ -7035,6 +7155,12 @@ // Since we will replace the stride by 1 the multiplication should go away. if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal)) return 0; + + // Detect reduction patterns + int RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) != -1) + return RedCost; + // Certain instructions can be cheaper to vectorize if they have a constant // second vector operand. One example of this are shifts on x86. Value *Op2 = I->getOperand(1); @@ -7158,6 +7284,11 @@ Trunc->getSrcTy(), CCH, CostKind, Trunc); } + // Detect reduction patterns + int RedCost; + if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind)) != -1) + return RedCost; + Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; @@ -7290,8 +7421,15 @@ SmallVector ReductionOperations = RdxDesc.getReductionOpChain(Phi, TheLoop); bool InLoop = !ReductionOperations.empty(); - if (InLoop) + if (InLoop) { InLoopReductionChains[Phi] = ReductionOperations; + // Add the elements to InLoopReductionImmediateChains for cost modelling. + Instruction *LastChain = Phi; + for (auto *I : ReductionOperations) { + InLoopReductionImmediateChains[I] = LastChain; + LastChain = I; + } + } LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? 
"inloop" : "out of loop") << " reduction for phi: " << *Phi << "\n"); } Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7203,12 +7203,12 @@ int SplittingRdxCost; switch (ReductionData.getKind()) { case RK_Arithmetic: - PairwiseRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/true); - SplittingRdxCost = - TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy, - /*IsPairwiseForm=*/false); + PairwiseRdxCost = TTI->getArithmeticReductionCost( + ReductionData.getOpcode(), ScalarTy, VecTy, + /*IsPairwiseForm=*/true); + SplittingRdxCost = TTI->getArithmeticReductionCost( + ReductionData.getOpcode(), ScalarTy, VecTy, + /*IsPairwiseForm=*/false); break; case RK_SMin: case RK_SMax: Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll @@ -12,33 +12,33 @@ ; CHECK: for.body.preheader: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[INDEX]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[TMP0]], i32 [[N]]) ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP3]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) -; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x 
i32> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP7]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) -; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP9]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP7]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP10]]) ; CHECK-NEXT: [[TMP12]] = add i32 [[TMP11]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: Index: llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -4,6 +4,7 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" +; Should not be vectorized define i64 @add_i64_i64(i64* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i64_i64( ; CHECK-NEXT: entry: @@ -41,24 +42,50 @@ ret i64 %r.0.lcssa } -; FIXME: 4x +; 4x to use VADDLV +; FIXME: TailPredicate define i64 @add_i32_i64(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP6]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -81,24 +108,50 @@ ret i64 %r.0.lcssa } -; FIXME: 4x ? 
+; 4x to use VADDLV +; FIXME: TailPredicate define i64 @add_i16_i64(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i64 +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP6]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -121,24 +174,50 @@ ret i64 %r.0.lcssa } -; 
FIXME: 4x ? +; 4x to use VADDLV +; FIXME: TailPredicate define i64 @add_i8_i64(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) +; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_08]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i64 +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_07:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_08]] +; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP6]] to i64 ; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_07]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -161,6 +240,7 @@ ret i64 %r.0.lcssa } +; 4x to use 
VADDV.u32 define i32 @add_i32_i32(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i32_i32( ; CHECK-NEXT: entry: @@ -182,7 +262,7 @@ ; CHECK-NEXT: [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -206,30 +286,30 @@ ret i32 %r.0.lcssa } -; FIXME: 8x +; 8x to use VADDV.u16 define i32 @add_i16_i32(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef) -; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef) +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP2]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -254,30 +334,30 @@ ret i32 %r.0.lcssa } -; FIXME: 16x +; 16x to use VADDV.u16 
define i32 @add_i8_i32(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP6]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP3:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -302,6 +382,7 @@ ret i32 %r.0.lcssa } +; 8x to use VADDV.u16 define signext i16 @add_i16_i16(i16* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i16_i16( ; CHECK-NEXT: entry: @@ -323,7 +404,7 @@ ; CHECK-NEXT: [[TMP4]] = add i16 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] @@ -347,30 +428,30 @@ ret i16 %r.0.lcssa } -; FIXME: 16x ? 
+; 16x to use VADDV.u8 define signext i16 @add_i8_i16(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef) -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> -; CHECK-NEXT: [[TMP3:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP2]], <8 x i16> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP2]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP3]]) ; CHECK-NEXT: [[TMP5]] = add i16 [[TMP4]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP5]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] @@ -395,6 +476,7 @@ ret i16 %r.0.lcssa } +; 16x to use VADDV.u8 define zeroext i8 @add_i8_i8(i8* nocapture readonly %x, i32 %n) #0 { ; CHECK-LABEL: @add_i8_i8( ; CHECK-NEXT: entry: @@ -416,7 +498,7 @@ ; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i8 [[R_0_LCSSA]] @@ -440,6 +522,7 @@ ret i8 %r.0.lcssa } +; Not vectorized define i64 
@mla_i64_i64(i64* nocapture readonly %x, i64* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i64_i64( ; CHECK-NEXT: entry: @@ -483,26 +566,57 @@ ret i64 %r.0.lcssa } +; 4x to use VMLAL.u32 +; FIXME: TailPredicate define i64 @mla_i32_i64(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i32_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP4]] to <4 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP5]]) +; CHECK-NEXT: [[TMP7]] = add i64 [[TMP6]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[I_010]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i32 [[I_010]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_09:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[I_010]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds 
i32, i32* [[Y]], i32 [[I_010]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], [[TMP9]] ; CHECK-NEXT: [[CONV:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_09]], [[CONV]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -528,28 +642,61 @@ ret i64 %r.0.lcssa } +; 8x to use VMLAL.u16 +; FIXME: TailPredicate define i64 @mla_i16_i64(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i16_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -8 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP4]], align 2 +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_011:%.*]] 
= phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 -; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_012]] +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[Y]], i32 [[I_012]] +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX1]], align 2 +; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP12]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] ; CHECK-NEXT: [[CONV3:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD]] = add nsw i64 [[R_011]], [[CONV3]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -577,28 +724,61 @@ ret i64 %r.0.lcssa } +; 8x to use VMLAL.u16 +; FIXME: 8x, TailPredicate, code generation? 
define i64 @mla_i8_i64(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i8_i64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -16 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = zext <16 x i32> [[TMP6]] to <16 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> [[TMP7]]) +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[I_012]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_011:%.*]] = phi i64 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[X]], i32 [[I_012]] +; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = 
getelementptr inbounds i8, i8* [[Y]], i32 [[I_012]] +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP12]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV2]], [[CONV]] ; CHECK-NEXT: [[CONV3:%.*]] = zext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD]] = add nuw nsw i64 [[R_011]], [[CONV3]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[ADD]], [[FOR_BODY]] ] +; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[R_0_LCSSA]] ; entry: @@ -626,6 +806,7 @@ ret i64 %r.0.lcssa } +; 4x to use VMLA.u32 define i32 @mla_i32_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i32_i32( ; CHECK-NEXT: entry: @@ -651,7 +832,7 @@ ; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -678,34 +859,35 @@ ret i32 %r.0.lcssa } +; 8x to use VMLA.u16 define i32 @mla_i16_i32(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i16_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP1]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef) -; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[TMP0]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP1]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef) +; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds 
i16, i16* [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <4 x i16>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP4]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> undef) -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD1]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[TMP3]] to <8 x i16>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP4]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> undef) +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[WIDE_MASKED_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <8 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i32> [[TMP6]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP21:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -734,34 +916,35 @@ ret i32 %r.0.lcssa } +; 16x to use VMLA.u8 define i32 @mla_i8_i32(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i8_i32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP9:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP9]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> 
[[WIDE_MASKED_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> undef) -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[WIDE_MASKED_LOAD1]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <16 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i32> [[TMP6]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP7]]) ; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP9:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i32 [[R_0_LCSSA]] @@ -790,6 +973,7 @@ ret i32 %r.0.lcssa } +; 8x to use VMLA.u16 define signext i16 @mla_i16_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i16_i16( ; CHECK-NEXT: entry: @@ -815,7 +999,7 @@ ; CHECK-NEXT: [[TMP7]] = add i16 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP23:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] @@ -842,34 +1026,35 @@ ret i16 %r.0.lcssa } +; 16x to use VMLA.u8 define signext i16 @mla_i8_i16(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i8_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[VECTOR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 -; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15 +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 [[INDEX]], i32 [[N]]) ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[X:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP1]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef) -; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD]] to <16 x i16> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[Y:%.*]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP4]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> undef) -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD1]] to <8 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <8 x i16> [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> [[TMP6]], <8 x i16> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP7]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP4]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> undef) +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_MASKED_LOAD1]] to <16 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <16 x i16> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i16> [[TMP6]], <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]]) ; CHECK-NEXT: [[TMP9]] = add i16 [[TMP8]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP11:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP10]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP9]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i16 [[R_0_LCSSA]] @@ -898,6 +1083,7 @@ ret i16 %r.0.lcssa } +; 16x to use VMLA.u8 define zeroext i8 @mla_i8_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i32 %n) #0 { ; CHECK-LABEL: @mla_i8_i8( ; CHECK-NEXT: entry: @@ -923,7 +1109,7 @@ ; CHECK-NEXT: [[TMP7]] = add i8 [[TMP6]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_COND_CLEANUP]], label [[VECTOR_BODY]], [[LOOP25:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[TMP7]], [[VECTOR_BODY]] ] ; CHECK-NEXT: ret i8 [[R_0_LCSSA]]