Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -628,13 +628,19 @@
   /// ((v0+v2), (v1+v3), undef, undef)
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
 
-  /// \returns The cost of Intrinsic instructions. Types analysis only.
+  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  /// Three cases are handled: 1. scalar instruction 2. vector instruction
+  /// 3. scalar instruction which is to be vectorized with VF.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF) const;
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1) const;
 
-  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  /// \returns The cost of Intrinsic instructions. Types analysis only.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Value *> Args, FastMathFlags FMF) const;
+                            ArrayRef<Type *> Tys, FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX) const;
 
   /// \returns The cost of Call instructions.
   int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
 
@@ -828,11 +834,10 @@
   virtual int getReductionCost(unsigned Opcode, Type *Ty,
                                bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Type *> Tys,
-                                    FastMathFlags FMF) = 0;
+                                    ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                    unsigned ScalarizationCostPassed) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Value *> Args,
-                                    FastMathFlags FMF) = 0;
+                                    ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
@@ -1086,13 +1091,13 @@
     return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
-                            FastMathFlags FMF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+                            FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                      ScalarizationCostPassed);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Value *> Args,
-                            FastMathFlags FMF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+                            ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
   }
   int getCallInstrCost(Function *F, Type *RetTy,
                        ArrayRef<Type *> Tys) override {
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -369,11 +369,12 @@
   }
 
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed) {
    return 1;
  }
 
  unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
     return 1;
   }
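Note on the new interface: the argument-based overload is now the form for clients that know the actual operands (and, in a vectorizer, the intended VF), while the type-based overload keeps the old behaviour unless a precomputed scalarization cost is passed. A minimal sketch of the two call forms, assuming a TargetTransformInfo &TTI and an IntrinsicInst *II in scope (the variable names are illustrative, not part of the patch):

    FastMathFlags FMF;
    if (auto *FPMO = dyn_cast<FPMathOperator>(II))
      FMF = FPMO->getFastMathFlags();

    // Argument analysis: a scalar call that is about to be widened by VF = 4.
    SmallVector<Value *, 4> Args(II->arg_operands());
    int WidenedCost = TTI.getIntrinsicInstrCost(II->getIntrinsicID(),
                                                II->getType(), Args, FMF, 4);

    // Type analysis only: the defaulted UINT_MAX sentinel asks the
    // implementation to derive the scalarization overhead from the types.
    SmallVector<Type *, 4> Tys;
    for (Value *Op : II->arg_operands())
      Tys.push_back(Op->getType());
    int TypeCost = TTI.getIntrinsicInstrCost(II->getIntrinsicID(),
                                             II->getType(), Tys, FMF);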
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -306,16 +306,28 @@
     return Cost;
   }
 
-  /// Estimate the overhead of scalarizing an instructions unique operands.
+  /// Estimate the overhead of scalarizing an instruction's unique
+  /// non-constant operands. The types of the arguments are ordinarily
+  /// scalar, in which case the costs are multiplied by VF. Vector
+  /// arguments are allowed if 1 is passed for VF.
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) {
     unsigned Cost = 0;
     SmallPtrSet<const Value *, 4> UniqueOperands;
     for (const Value *A : Args) {
-      if (UniqueOperands.insert(A).second)
-        Cost += getScalarizationOverhead(VectorType::get(A->getType(), VF),
-                                         false, true);
+      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
+        Type *VecTy = nullptr;
+        if (A->getType()->isVectorTy()) {
+          assert (VF == 1 && "Vector argument passed with VF > 1");
+          VecTy = A->getType();
+        }
+        else
+          VecTy = VectorType::get(A->getType(), VF);
+
+        Cost += getScalarizationOverhead(VecTy, false, true);
+      }
     }
+
     return Cost;
   }
 
@@ -705,18 +717,40 @@
     return Cost;
   }
 
-  /// Get intrinsic cost based on arguments
+  /// Get intrinsic cost based on arguments.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF = 1) {
+    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+    assert ((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+
     switch (IID) {
     default: {
+      // Assume that we need to scalarize this intrinsic.
       SmallVector<Type *, 4> Types;
-      for (Value *Op : Args)
-        Types.push_back(Op->getType());
-      return static_cast<T *>(this)->getIntrinsicInstrCost(IID, RetTy, Types,
-                                                           FMF);
+      for (Value *Op : Args) {
+        Type *OpTy = Op->getType();
+        assert (VF == 1 || !OpTy->isVectorTy());
+        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
+      }
+
+      if (VF > 1 && !RetTy->isVoidTy())
+        RetTy = VectorType::get(RetTy, VF);
+
+      // Compute the scalarization overhead based on Args for a vector
+      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+      // CostModel will pass a vector RetTy and VF is 1.
+      unsigned ScalarizationCost = UINT_MAX;
+      if (RetVF > 1 || VF > 1) {
+        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+      }
+
+      return static_cast<T *>(this)->
+        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
    }
    case Intrinsic::masked_scatter: {
+      assert (VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
@@ -727,6 +761,7 @@
                                                        Alignment);
    }
    case Intrinsic::masked_gather: {
+      assert (VF == 1 && "Can't vectorize types here.");
      Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
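The operand filtering introduced above is the heart of the patch: only unique, non-constant operands are charged extraction costs, since a constant is already available in scalar form and a repeated operand only has to be extracted once. A standalone model of that rule (plain C++, not LLVM code; the flat ExtractCost stands in for the target-dependent getScalarizationOverhead()):

    #include <set>
    #include <string>
    #include <vector>

    struct Operand { std::string Name; bool IsConstant; };

    // Mirrors getOperandsScalarizationOverhead(): skip constants, count each
    // unique operand once, charge one extract per lane (VF lanes).
    unsigned modelOperandsOverhead(const std::vector<Operand> &Args,
                                   unsigned VF, unsigned ExtractCost = 1) {
      std::set<std::string> Unique;
      unsigned Cost = 0;
      for (const Operand &A : Args)
        if (!A.IsConstant && Unique.insert(A.Name).second)
          Cost += VF * ExtractCost;
      return Cost;
    }

    // modelOperandsOverhead({{"x", false}, {"x", false}, {"c", true}}, 4)
    // returns 4: the duplicate %x and the constant contribute nothing.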
@@ -738,19 +773,23 @@
     }
   }
 
-  /// Get intrinsic cost based on argument types
+  /// Get intrinsic cost based on argument types.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed = UINT_MAX) {
     SmallVector<unsigned, 2> ISDs;
     unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
     switch (IID) {
     default: {
       // Assume that we need to scalarize this intrinsic.
-      unsigned ScalarizationCost = 0;
+      unsigned ScalarizationCost = ScalarizationCostPassed;
       unsigned ScalarCalls = 1;
       Type *ScalarRetTy = RetTy;
       if (RetTy->isVectorTy()) {
-        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        if (ScalarizationCostPassed == UINT_MAX)
+          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
         ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
         ScalarRetTy = RetTy->getScalarType();
       }
@@ -758,7 +797,8 @@
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         Type *Ty = Tys[i];
         if (Ty->isVectorTy()) {
-          ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+          if (ScalarizationCostPassed == UINT_MAX)
+            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
           ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
           Ty = Ty->getScalarType();
         }
@@ -906,7 +946,8 @@
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
     if (RetTy->isVectorTy()) {
-      unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      unsigned ScalarizationCost = ((ScalarizationCostPassed != UINT_MAX) ?
+         ScalarizationCostPassed : getScalarizationOverhead(RetTy, true, false));
       unsigned ScalarCalls = RetTy->getVectorNumElements();
       SmallVector<Type *, 4> ScalarTys;
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
@@ -919,7 +960,8 @@
         IID, RetTy->getScalarType(), ScalarTys, FMF);
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       if (Tys[i]->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+        if (ScalarizationCostPassed == UINT_MAX)
+          ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
         ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
       }
     }
Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -542,9 +542,7 @@
   }
   case Instruction::Call:
     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      SmallVector<Value *, 4> Args;
-      for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
-        Args.push_back(II->getArgOperand(J));
+      SmallVector<Value *, 4> Args(II->arg_operands());
 
       FastMathFlags FMF;
       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -378,17 +378,17 @@
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                               ArrayRef<Type *> Tys,
-                                               FastMathFlags FMF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+    ArrayRef<Type *> Tys, FastMathFlags FMF,
+    unsigned ScalarizationCostPassed) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                            ScalarizationCostPassed);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                               ArrayRef<Value *> Args,
-                                               FastMathFlags FMF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+    ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
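The plumbing above passes ScalarizationCostPassed through unchanged; the sentinel semantics live entirely in BasicTTIImpl. A standalone model of that contract, assuming the default-case shape of the type-based overload (some number of scalar calls plus one scalarization overhead):

    #include <climits>

    unsigned modelIntrinsicCost(unsigned ScalarCalls, unsigned ScalarCallCost,
                                unsigned TypeBasedOverhead,
                                unsigned ScalarizationCostPassed = UINT_MAX) {
      // UINT_MAX means "nobody precomputed the overhead, derive it from the
      // types"; any other value is trusted and reused as-is.
      unsigned Overhead = (ScalarizationCostPassed == UINT_MAX)
                              ? TypeBasedOverhead
                              : ScalarizationCostPassed;
      return ScalarCalls * ScalarCallCost + Overhead;
    }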
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -74,9 +74,11 @@
                   const SCEV *Ptr);
 
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF);
+                            ArrayRef<Type *> Tys, FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Value *> Args, FastMathFlags FMF);
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1);
 
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1370,7 +1370,8 @@
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                      unsigned ScalarizationCostPassed) {
   // Costs should match the codegen from:
   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1551,12 +1552,12 @@
   if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
     return LT.first * Entry->Cost;
 
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Value *> Args, FastMathFlags FMF) {
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+                                      ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
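Any other target that overrides these hooks needs the same mechanical update as X86 above: accept the extra parameter and forward it to BaseT, so a caller's precomputed overhead actually reaches the generic implementation. Sketch for a hypothetical TgtTTIImpl (not part of this patch):

    int TgtTTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
                                          ArrayRef<Type *> Tys, FastMathFlags FMF,
                                          unsigned ScalarizationCostPassed) {
      // Forward the precomputed overhead (or the UINT_MAX sentinel) rather
      // than letting BaseT recompute it from the already-widened types.
      return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
                                          ScalarizationCostPassed);
    }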
Index: lib/Transforms/Vectorize/BBVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/BBVectorize.cpp
+++ lib/Transforms/Vectorize/BBVectorize.cpp
@@ -1127,39 +1127,51 @@
           FastMathFlags FMFCI;
           if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
             FMFCI = FPMOCI->getFastMathFlags();
+          SmallVector<Value *, 4> IArgs(CI->arg_operands());
+          unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
 
-          SmallVector<Type*, 4> Tys;
-          for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
-            Tys.push_back(CI->getArgOperand(i)->getType());
-          unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
-          Tys.clear();
           CallInst *CJ = cast<CallInst>(J);
 
           FastMathFlags FMFCJ;
           if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
             FMFCJ = FPMOCJ->getFastMathFlags();
 
-          for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
-            Tys.push_back(CJ->getArgOperand(i)->getType());
-          unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
+          SmallVector<Value *, 4> JArgs(CJ->arg_operands());
+          unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
 
-          Tys.clear();
           assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
                  "Intrinsic argument counts differ");
+          SmallVector<Type*, 4> Tys;
+          SmallVector<Value *, 4> VecArgs;
           for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
             if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
-                 IID == Intrinsic::cttz) && i == 1)
+                 IID == Intrinsic::cttz) && i == 1) {
               Tys.push_back(CI->getArgOperand(i)->getType());
-            else
+              VecArgs.push_back(CI->getArgOperand(i));
+            }
+            else {
               Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
                                               CJ->getArgOperand(i)->getType()));
+              // Add both operands, and then count their scalarization overhead
+              // with VF 1.
+              VecArgs.push_back(CI->getArgOperand(i));
+              VecArgs.push_back(CJ->getArgOperand(i));
+            }
           }
 
+          // Compute the scalarization cost here with the original operands (to
+          // check for uniqueness etc), and then call getIntrinsicInstrCost()
+          // with the constructed vector types.
+          Type *RetTy = getVecTypeForPair(IT1, JT1);
+          unsigned ScalarizationCost = 0;
+          if (!RetTy->isVoidTy())
+            ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
+          ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
+
           FastMathFlags FMFV = FMFCI;
           FMFV &= FMFCJ;
-          Type *RetTy = getVecTypeForPair(IT1, JT1);
-          unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
+          unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
+                                                      ScalarizationCost);
 
           if (VCost > ICost + JCost)
             return false;
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3784,16 +3784,12 @@
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
-  Type *RetTy = ToVectorTy(CI->getType(), VF);
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI->arg_operands())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
   FastMathFlags FMF;
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
 
-  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+  SmallVector<Value *, 4> Operands(CI->arg_operands());
+  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
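Both vectorizers now hand the cost model the scalar call plus a VF instead of pre-widened types; the assert in the argument-based overload pins down the resulting invariant: either the return type is already a vector and VF is 1 (CostModel-style queries), or the return type is scalar and VF says how far to widen (vectorizer-style queries, as in the LoopVectorize hunk above). A standalone model of that invariant:

    #include <cassert>

    // RetVF: lanes in the passed return type; VF: requested widening factor.
    unsigned modelEffectiveLanes(unsigned RetVF, unsigned VF) {
      assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
      return RetVF > 1 ? RetVF : VF;
    }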
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1914,12 +1914,9 @@
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
       // Calculate the cost of the scalar and vector calls.
-      SmallVector<Type*, 4> ScalarTys, VecTys;
-      for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+      SmallVector<Type*, 4> ScalarTys;
+      for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
         ScalarTys.push_back(CI->getArgOperand(op)->getType());
-        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                         VecTy->getNumElements()));
-      }
 
       FastMathFlags FMF;
       if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1928,7 +1925,9 @@
       int ScalarCallCost = VecTy->getNumElements() *
           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
 
-      int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+      SmallVector<Value *, 4> Args(CI->arg_operands());
+      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+                                                   VecTy->getNumElements());
 
       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
             << " (" << VecCallCost << "-" << ScalarCallCost << ")"
Index: test/Analysis/CostModel/X86/arith-fp.ll
===================================================================
--- test/Analysis/CostModel/X86/arith-fp.ll
+++ test/Analysis/CostModel/X86/arith-fp.ll
@@ -456,20 +456,20 @@
 ; AVX2: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
 ; AVX512: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-; SSE2: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
-; SSE42: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+; SSE2: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+; SSE42: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
 ; AVX: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
 ; AVX2: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
 ; AVX512: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-; SSE2: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
-; SSE42: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+; SSE2: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+; SSE42: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
 ; AVX: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
 ; AVX2: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
 ; AVX512: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-; SSE2: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
-; SSE42: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+; SSE2: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+; SSE42: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
 ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
 ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
 ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
@@ -481,20 +481,20 @@
 ; AVX2: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
 ; AVX512: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; SSE2: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
-; SSE42: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+; SSE2: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+; SSE42: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
 ; AVX: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
 ; AVX2: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
 ; AVX512: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-; SSE2: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
-; SSE42: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+; SSE2: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+; SSE42: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
 ; AVX: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
 ; AVX2: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
 ; AVX512: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-; SSE2: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
-; SSE42: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+; SSE2: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+; SSE42: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
 ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
 ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
 ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
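The lower SSE costs for the fma calls above are consistent with the operand filtering: each call passes the same undef value (a Constant) for all three operands, so the per-lane extract costs that the old type-only analysis charged for three vector operands are no longer counted; what remains is the per-lane scalar calls plus the cost of assembling the result vector.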
-; VF_2-NEXT:  Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT:  Found an estimated cost of 10 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT:  Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:  Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
Index: test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
===================================================================
--- test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -124,12 +124,12 @@
 ; VF_4:       Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_4-NEXT:  Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT:  Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-LABEL: Checking a loop in "half_factor_2"
 ; VF_8:       Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_8-NEXT:  Found an estimated cost of 80 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-NEXT:  Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
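The interleaved-store costs here drop for what appears to be the same reason: the stored value is a constant (i64 0, half 0xH0000), so, on my reading of the new overhead helpers, replicating it per lane is no longer charged as operand scalarization.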