Diff 379740

llvm/include/llvm/Analysis/TargetTransformInfo.h

Show First 20 Lines • Show All 1,187 Lines • ▼ Show 20 Lines
InstructionCost getArithmeticReductionCost(		InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,		unsigned Opcode, VectorType *Ty, Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;		TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

InstructionCost getMinMaxReductionCost(		InstructionCost getMinMaxReductionCost(
VectorType Ty, VectorType CondTy, bool IsUnsigned,		VectorType Ty, VectorType CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;		TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

		/// Calculate the cost of a call to the llvm.fmuladd intrinsic. This is
		/// modeled as the cost of a normal fmul instruction plus the cost of an fadd
		/// reduction.
		///
		/// \param Ty the vector type the cost is computed for.
		/// \param FMF optional fast-math flags to take into account.
		/// \param CostKind which kind of cost to compute; defaults to
		/// TTI::TCK_RecipThroughput.
		/// \returns the combined cost as an InstructionCost.
		InstructionCost getFMulAddReductionCost(
		VectorType *Ty, Optional<FastMathFlags> FMF,
		TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;

/// Calculate the cost of an extended reduction pattern, similar to		/// Calculate the cost of an extended reduction pattern, similar to
/// getArithmeticReductionCost of an Add reduction with an extension and		/// getArithmeticReductionCost of an Add reduction with an extension and
/// optional multiply. This is the cost of as:		/// optional multiply. This is the cost of as:
/// ResTy vecreduce.add(ext(Ty A)), or if IsMLA flag is set then:		/// ResTy vecreduce.add(ext(Ty A)), or if IsMLA flag is set then:
/// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)). The reduction happens		/// ResTy vecreduce.add(mul(ext(Ty A), ext(Ty B)). The reduction happens
/// on a VectorType with ResTy elements and Ty lanes.		/// on a VectorType with ResTy elements and Ty lanes.
InstructionCost getExtendedAddReductionCost(		InstructionCost getExtendedAddReductionCost(
bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,		bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,
▲ Show 20 Lines • Show All 453 Lines • ▼ Show 20 Lines	virtual InstructionCost getInterleavedMemoryOpCost(
bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;		bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0;
virtual InstructionCost		virtual InstructionCost
getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,		getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
Optional<FastMathFlags> FMF,		Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind) = 0;		TTI::TargetCostKind CostKind) = 0;
virtual InstructionCost		virtual InstructionCost
getMinMaxReductionCost(VectorType Ty, VectorType CondTy, bool IsUnsigned,		getMinMaxReductionCost(VectorType Ty, VectorType CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind) = 0;		TTI::TargetCostKind CostKind) = 0;
		/// Pure virtual hook for the llvm.fmuladd reduction cost query; concrete
		/// implementations supply the value for the given vector type \p Ty,
		/// optional fast-math flags \p FMF and cost kind \p CostKind.
		virtual InstructionCost
		getFMulAddReductionCost(VectorType *Ty, Optional<FastMathFlags> FMF,
		TTI::TargetCostKind CostKind) = 0;
virtual InstructionCost getExtendedAddReductionCost(		virtual InstructionCost getExtendedAddReductionCost(
bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,		bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;		TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
virtual InstructionCost		virtual InstructionCost
getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,		getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) = 0;		TTI::TargetCostKind CostKind) = 0;
virtual InstructionCost getCallInstrCost(Function F, Type RetTy,		virtual InstructionCost getCallInstrCost(Function F, Type RetTy,
ArrayRef<Type *> Tys,		ArrayRef<Type *> Tys,
▲ Show 20 Lines • Show All 499 Lines • ▼ Show 20 Lines	getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
TTI::TargetCostKind CostKind) override {		TTI::TargetCostKind CostKind) override {
return Impl.getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);		return Impl.getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}		}
InstructionCost		InstructionCost
getMinMaxReductionCost(VectorType Ty, VectorType CondTy, bool IsUnsigned,		getMinMaxReductionCost(VectorType Ty, VectorType CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind) override {		TTI::TargetCostKind CostKind) override {
return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);		return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
}		}
		/// Forwards the llvm.fmuladd reduction cost query to the wrapped Impl,
		/// passing all arguments through unchanged.
		InstructionCost getFMulAddReductionCost(
		    VectorType *Ty, Optional<FastMathFlags> FMF,
		    TTI::TargetCostKind CostKind) override {
		  return Impl.getFMulAddReductionCost(Ty, FMF, CostKind);
		}
InstructionCost getExtendedAddReductionCost(		InstructionCost getExtendedAddReductionCost(
bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,		bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override {		TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override {
return Impl.getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty,		return Impl.getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty,
CostKind);		CostKind);
}		}
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,		InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) override {		TTI::TargetCostKind CostKind) override {
▲ Show 20 Lines • Show All 228 Lines • Show Last 20 Lines

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Show First 20 Lines • Show All 631 Lines • ▼ Show 20 Lines	InstructionCost getArithmeticReductionCost(unsigned, VectorType *,
return 1;		return 1;
}		}

InstructionCost getMinMaxReductionCost(VectorType , VectorType , bool,		InstructionCost getMinMaxReductionCost(VectorType , VectorType , bool,
TTI::TargetCostKind) const {		TTI::TargetCostKind) const {
return 1;		return 1;
}		}

		/// Default implementation: every llvm.fmuladd reduction is given a cost of
		/// 1, regardless of the vector type, fast-math flags or cost kind.
		InstructionCost
		getFMulAddReductionCost(VectorType *, Optional<FastMathFlags>,
		                        TTI::TargetCostKind) const {
		  return 1;
		}

InstructionCost		InstructionCost
getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,		getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,
VectorType *Ty,		VectorType *Ty,
TTI::TargetCostKind CostKind) const {		TTI::TargetCostKind CostKind) const {
return 1;		return 1;
}		}

InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {		InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const {
▲ Show 20 Lines • Show All 538 Lines • Show Last 20 Lines

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Show First 20 Lines • Show All 2,168 Lines • ▼ Show 20 Lines	MinMaxCost +=
thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,		thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind));		CmpInst::BAD_ICMP_PREDICATE, CostKind));
// The last min/max should be in vector registers and we counted it above.		// The last min/max should be in vector registers and we counted it above.
// So just need a single extractelement.		// So just need a single extractelement.
return ShuffleCost + MinMaxCost +		return ShuffleCost + MinMaxCost +
thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);		thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}		}

		/// Cost of an llvm.fmuladd reduction over \p Ty, computed as the sum of
		///   - an fadd reduction over \p Ty (queried with \p FMF), and
		///   - a single vector fmul on \p Ty.
		/// Both queries go through thisT() and use the same \p CostKind.
		InstructionCost getFMulAddReductionCost(VectorType *Ty,
		                                        Optional<FastMathFlags> FMF,
		                                        TTI::TargetCostKind CostKind) {
		  // Query the reduction part first, then the multiply that feeds it.
		  InstructionCost ReduceCost = thisT()->getArithmeticReductionCost(
		      Instruction::FAdd, Ty, FMF, CostKind);
		  InstructionCost MultiplyCost =
		      thisT()->getArithmeticInstrCost(Instruction::FMul, Ty, CostKind);
		  return MultiplyCost + ReduceCost;
		}
		paulwalker-armUnsubmitted Not Done Reply Inline Actions Do we need a new TTI interface for this? To my mind the costing side of TTI exists to cost real entities and in this instance the IR has no discrete concept for an FMA reduction. Instead what we have is LoopVectorize pretending such a concept exists that it knows ahead of time it will simulate with separate fmul and ordered_fadd_reduce operations. For this reason I think it would be better to explicitly cost that exact idiom within the same code that is using it. So I guess I'm saying that if you move these three lines into LoopVectorize.cpp the patch can be much smaller and you're not creating a new interface for something that doesn't really exist. paulwalker-arm: Do we need a new TTI interface for this? To my mind the costing side of TTI exists to cost…
		RosieSumpterAuthorUnsubmitted Done Reply Inline Actions Hi Paul, thanks for having a look at this. Initially I did put the cost calculation into the vectorizer, but there was some discussion (also a comment from @fhahn) about whether it would make more sense in `TTI.getArithmeticReductionCost` and, to avoid changing the interface for `getArithmeticReductionCost`, @david-arm and @sdesmalen suggested adding the new `getFMulAddReductionCost`. I am happy to change it back to LoopVectorize.cpp if that seems like the better option. RosieSumpter: Hi Paul, thanks for having a look at this. Initially I did put the cost calculation into the…
		david-armUnsubmitted Not Done Reply Inline Actions For what it's worth, I personally think it makes a bit more sense to calculate the costs separately in the vectoriser because in my mind at least the fmul isn't part of the reduction, since we are always going to widen it into a normal vector operation. I suggested adding a new interface as a compromise, so we can avoid over-complicating (and avoid making this patch significantly larger) the existing getArithmeticReductionCost method. I wouldn't block the patch over this though! david-arm: For what it's worth, I personally think it makes a bit more sense to calculate the costs…

InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,		InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
Type ResTy, VectorType Ty,		Type ResTy, VectorType Ty,
TTI::TargetCostKind CostKind) {		TTI::TargetCostKind CostKind) {
// Without any native support, this is equivalent to the cost of		// Without any native support, this is equivalent to the cost of
// vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))		// vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext))
VectorType *ExtTy = VectorType::get(ResTy, Ty);		VectorType *ExtTy = VectorType::get(ResTy, Ty);
InstructionCost RedCost = thisT()->getArithmeticReductionCost(		InstructionCost RedCost = thisT()->getArithmeticReductionCost(
Instruction::Add, ExtTy, None, CostKind);		Instruction::Add, ExtTy, None, CostKind);
Show All 38 Lines

llvm/lib/Analysis/TargetTransformInfo.cpp

Show First 20 Lines • Show All 911 Lines • ▼ Show 20 Lines	InstructionCost TargetTransformInfo::getMinMaxReductionCost(
VectorType Ty, VectorType CondTy, bool IsUnsigned,		VectorType Ty, VectorType CondTy, bool IsUnsigned,
TTI::TargetCostKind CostKind) const {		TTI::TargetCostKind CostKind) const {
InstructionCost Cost =		InstructionCost Cost =
TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);		TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");		assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;		return Cost;
}		}

		/// Asks the underlying TTIImpl for the cost of an llvm.fmuladd reduction and
		/// asserts that the reported cost is not negative before returning it.
		InstructionCost
		TargetTransformInfo::getFMulAddReductionCost(VectorType *Ty,
		                                             Optional<FastMathFlags> FMF,
		                                             TTI::TargetCostKind CostKind) const {
		  InstructionCost Cost =
		      TTIImpl->getFMulAddReductionCost(Ty, FMF, CostKind);
		  assert(Cost >= 0 && "TTI should not produce negative costs!");
		  return Cost;
		}

InstructionCost TargetTransformInfo::getExtendedAddReductionCost(		InstructionCost TargetTransformInfo::getExtendedAddReductionCost(
bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,		bool IsMLA, bool IsUnsigned, Type ResTy, VectorType Ty,
TTI::TargetCostKind CostKind) const {		TTI::TargetCostKind CostKind) const {
return TTIImpl->getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty,		return TTIImpl->getExtendedAddReductionCost(IsMLA, IsUnsigned, ResTy, Ty,
CostKind);		CostKind);
}		}

InstructionCost		InstructionCost
▲ Show 20 Lines • Show All 246 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,977 Lines • ▼ Show 20 Lines	bool AArch64TTIImpl::isLegalToVectorizeReduction(
case RecurKind::SMin:		case RecurKind::SMin:
case RecurKind::SMax:		case RecurKind::SMax:
case RecurKind::UMin:		case RecurKind::UMin:
case RecurKind::UMax:		case RecurKind::UMax:
case RecurKind::FMin:		case RecurKind::FMin:
case RecurKind::FMax:		case RecurKind::FMax:
case RecurKind::SelectICmp:		case RecurKind::SelectICmp:
case RecurKind::SelectFCmp:		case RecurKind::SelectFCmp:
		case RecurKind::FMulAdd:
return true;		return true;
default:		default:
return false;		return false;
}		}
}		}

InstructionCost		InstructionCost
AArch64TTIImpl::getMinMaxReductionCost(VectorType Ty, VectorType CondTy,		AArch64TTIImpl::getMinMaxReductionCost(VectorType Ty, VectorType CondTy,
▲ Show 20 Lines • Show All 305 Lines • Show Last 20 Lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,249 Lines • ▼ Show 20 Lines	Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
Instruction *LastChain = InLoopReductionImmediateChains[RetI];		Instruction *LastChain = InLoopReductionImmediateChains[RetI];
Instruction *ReductionPhi = LastChain;		Instruction *ReductionPhi = LastChain;
while (!isa<PHINode>(ReductionPhi))		while (!isa<PHINode>(ReductionPhi))
ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];		ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

const RecurrenceDescriptor &RdxDesc =		const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];		Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];

InstructionCost BaseCost = TTI.getArithmeticReductionCost(		InstructionCost BaseCost;
		if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
		// Recognize a call to the llvm.fmuladd intrinsic.
		BaseCost = TTI.getFMulAddReductionCost(VectorTy, RdxDesc.getFastMathFlags(),
		CostKind);
		else
		BaseCost = TTI.getArithmeticReductionCost(
RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);		RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

// If we're using ordered reductions then we can just return the base cost		// If we're using ordered reductions then we can just return the base cost
// here, since getArithmeticReductionCost calculates the full ordered		// here, since getArithmeticReductionCost calculates the full ordered
// reduction cost when FP reassociation is not allowed.		// reduction cost when FP reassociation is not allowed.
if (useOrderedReductions(RdxDesc))		if (useOrderedReductions(RdxDesc))
return BaseCost;		return BaseCost;

		paulwalker-armUnsubmitted Done Reply Inline Actions Perhaps I've misunderstood something, because this code looks more complicated (or perhaps just more verbose) than the previous patch. I guess I'm struggling to see why different versions of `getArithmeticInstrCost` are being called between the two. I just assumed you'd add something like: InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); + // For llvm.fmuladd based reductions we must include the cost of the normal + // vector fmul that will occur prior to the fadd reduction. + if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd) + BaseCost += TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind); // If we're using ordered reductions then we can just return the base cost // here, since getArithmeticReductionCost calculates the full ordered // reduction cost when FP reassociation is not allowed. if (useOrderedReductions(RdxDesc)) return BaseCost; paulwalker-arm: Perhaps I've misunderstood something? because this code looks more complicated (or perhaps just…
// Get the operand that was not the reduction chain and match it to one of the		// Get the operand that was not the reduction chain and match it to one of the
// patterns, returning the better cost if it is found.		// patterns, returning the better cost if it is found.
Instruction *RedOp = RetI->getOperand(1) == LastChain		Instruction *RedOp = RetI->getOperand(1) == LastChain
? dyn_cast<Instruction>(RetI->getOperand(0))		? dyn_cast<Instruction>(RetI->getOperand(0))
		fhahnUnsubmitted Not Done Reply Inline Actions Could this be handled in `TTI.getArithmeticReductionCost`? What about other potential users of fmuladd reductions, like the SLP vectorizer? fhahn: Could this be handled in `TTI.getArithmeticReductionCost`? What about other potential users of…
		RosieSumpterAuthorUnsubmitted Not Done Reply Inline Actions Hi Florian, I did discuss this option with @david-arm, but it would mean changing the interface of `getArithmeticReductionCost` (e.g. by adding an optional `Instruction` argument) to be able to determine if it's a call to the fmuladd intrinsic. David also made the point that the fmul isn't actually part of the reduction cost, so perhaps it doesn't make sense to ask for an fmuladd reduction cost? If you would prefer it to be there though I'm happy to make the change. For the SLP vectorizer, it doesn't handle the fmuladd at the moment (I've added an assert for this in D111555 for safety) RosieSumpter: Hi Florian, I did discuss this option with @david-arm, but it would mean changing the interface…
: dyn_cast<Instruction>(RetI->getOperand(1));		: dyn_cast<Instruction>(RetI->getOperand(1));

VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);		VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

Instruction Op0, Op1;		Instruction Op0, Op1;
if (RedOp &&		if (RedOp &&
match(RedOp,		match(RedOp,
m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&		m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
▲ Show 20 Lines • Show All 645 Lines • ▼ Show 20 Lines	if (canTruncateToMinimalBitwidth(I, VF)) {
VectorTy =		VectorTy =
smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);		smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
}		}
}		}

return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);		return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}		}
case Instruction::Call: {		case Instruction::Call: {
		// Recognize a call to the llvm.fmuladd intrinsic.
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: redundant comment (i.e. the code says as much) sdesmalen: nit: redundant comment (i.e. the code says as much)
		if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) {
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: unnecessary curly braces. sdesmalen: nit: unnecessary curly braces.
		// Detect reduction patterns.
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: redundant comment. sdesmalen: nit: redundant comment.
		if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
		return *RedCost;
		}
bool NeedToScalarize;		bool NeedToScalarize;
CallInst *CI = cast<CallInst>(I);		CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);		InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
if (getVectorIntrinsicIDForCall(CI, TLI)) {		if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);		InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);		return std::min(CallCost, IntrinsicCost);
}		}
return CallCost;		return CallCost;
▲ Show 20 Lines • Show All 2,706 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

Show First 20 Lines • Show All 389 Lines • ▼ Show 20 Lines	for.end: ; preds = %for.body
ret float %rdx		ret float %rdx
}		}

; Test case where loop has a call to the llvm.fmuladd intrinsic.		; Test case where loop has a call to the llvm.fmuladd intrinsic.
define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 {		define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 {
; CHECK-ORDERED-LABEL: @fmuladd_strict		; CHECK-ORDERED-LABEL: @fmuladd_strict
; CHECK-ORDERED: vector.body:		; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[VEC_PHI:%.]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.]], %vector.body ]		; CHECK-ORDERED: [[VEC_PHI:%.]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.]], %vector.body ]
; CHECK-ORDERED: [[WIDE_LOAD:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
		david-armUnsubmitted Not Done Reply Inline Actions Hi @RosieSumpter, do you know why these CHECK lines have changed? It doesn't seem like your patch should affect these tests because these loops are forced to use a certain VF anyway. david-arm: Hi @RosieSumpter, do you know why these CHECK lines have changed? It doesn't seem like your…
		RosieSumpterAuthorUnsubmitted Not Done Reply Inline Actions Hi @david-arm, good point. The reason the test has changed is because of adding `FMulAdd` as an allowed recurrence kind to `AArch64TTIImpl::isLegalToVectorizeReduction`. I now see that it probably makes more sense for this particular change to be in D111555, so I'll do that now. RosieSumpter: Hi @david-arm, good point. The reason the test has changed is because of adding `FMulAdd` as…
; CHECK-ORDERED: [[WIDE_LOAD1:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD1:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[WIDE_LOAD2:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD2:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[WIDE_LOAD3:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD3:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[WIDE_LOAD4:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD4:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[WIDE_LOAD5:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD5:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[WIDE_LOAD6:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD6:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[WIDE_LOAD7:%.]] = load <4 x float>, <4 x float>		; CHECK-ORDERED: [[WIDE_LOAD7:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-ORDERED: [[FMUL:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]		; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])		; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]		; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])		; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]		; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])		; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]		; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])		; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
; CHECK-ORDERED: for.end		; CHECK-ORDERED: for.end
; CHECK-ORDERED: [[RES:%.]] = phi float [ [[SCALAR:%.]], %for.body ], [ [[RDX3]], %middle.block ]		; CHECK-ORDERED: [[RES:%.]] = phi float [ [[SCALAR:%.]], %for.body ], [ [[RDX3]], %middle.block ]
; CHECK-ORDERED: ret float [[RES]]		; CHECK-ORDERED: ret float [[RES]]

; CHECK-UNORDERED-LABEL: @fmuladd_strict		; CHECK-UNORDERED-LABEL: @fmuladd_strict
; CHECK-UNORDERED: vector.body		; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: [[VEC_PHI:%.]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI:%.]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI1:%.]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI1:%.]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI2:%.]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI2:%.]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI3:%.]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI3:%.]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.]], %vector.body ]
; CHECK-UNORDERED: [[WIDE_LOAD:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD1:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD1:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD2:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD2:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD3:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD3:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD4:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD4:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD5:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD5:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD6:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD6:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[WIDE_LOAD7:%.]] = load <4 x float>, <4 x float>		; CHECK-UNORDERED: [[WIDE_LOAD7:%.]] = load <vscale x 8 x float>, <vscale x 8 x float>
; CHECK-UNORDERED: [[FMULADD]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])		; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
; CHECK-UNORDERED: [[FMULADD1]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])		; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
; CHECK-UNORDERED: [[FMULADD2]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])		; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
; CHECK-UNORDERED: [[FMULADD3]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])		; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd		; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block		; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <4 x float> [[FMULADD1]], [[FMULADD]]		; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <4 x float> [[FMULADD2]], [[BIN_RDX]]		; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <4 x float> [[FMULADD3]], [[BIN_RDX1]]		; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]		; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
; CHECK-UNORDERED: for.body		; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: [[SUM_07:%.]] = phi float [ [[SCALAR:%.]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]		; CHECK-UNORDERED: [[SUM_07:%.]] = phi float [ [[SCALAR:%.]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
; CHECK-UNORDERED: [[LOAD:%.]] = load float, float		; CHECK-UNORDERED: [[LOAD:%.]] = load float, float
; CHECK-UNORDERED: [[LOAD1:%.]] = load float, float		; CHECK-UNORDERED: [[LOAD1:%.]] = load float, float
; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])		; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
; CHECK-UNORDERED: for.end		; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]		; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]		; CHECK-UNORDERED: ret float [[RES]]
Show All 20 Lines	for.end:
ret float %muladd		ret float %muladd
}		}

; Same as above but where the call to the llvm.fmuladd intrinsic uses a fast-math flag.		; Same as above but where the call to the llvm.fmuladd intrinsic uses a fast-math flag.
define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 {		define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 {
; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf		; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
; CHECK-ORDERED: vector.body:		; CHECK-ORDERED: vector.body:
; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]		; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*		; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]		; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])		; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]		; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])		; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]		; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])		; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]		; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])		; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
; CHECK-ORDERED: for.end		; CHECK-ORDERED: for.end
; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]		; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
; CHECK-ORDERED: ret float [[RES]]		; CHECK-ORDERED: ret float [[RES]]

; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf		; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
; CHECK-UNORDERED: vector.body		; CHECK-UNORDERED: vector.body
; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]		; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*		; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
; CHECK-UNORDERED: [[FMULADD]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])		; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
; CHECK-UNORDERED: [[FMULADD1]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])		; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
; CHECK-UNORDERED: [[FMULADD2]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])		; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
; CHECK-UNORDERED: [[FMULADD3]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])		; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
; CHECK-UNORDERED-NOT: call nnan float @llvm.vector.reduce.fadd		; CHECK-UNORDERED-NOT: call nnan float @llvm.vector.reduce.fadd
; CHECK-UNORDERED: middle.block		; CHECK-UNORDERED: middle.block
; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <4 x float> [[FMULADD1]], [[FMULADD]]		; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <4 x float> [[FMULADD2]], [[BIN_RDX]]		; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <4 x float> [[FMULADD3]], [[BIN_RDX1]]		; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]		; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
; CHECK-UNORDERED: for.body		; CHECK-UNORDERED: for.body
; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]		; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*		; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*		; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])		; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
; CHECK-UNORDERED: for.end		; CHECK-UNORDERED: for.end
; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]		; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
; CHECK-UNORDERED: ret float [[RES]]		; CHECK-UNORDERED: ret float [[RES]]
Show All 35 Lines

llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll

Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	for.body:
%add = fadd double %0, %sum.07		%add = fadd double %0, %sum.07
%iv.next = add nuw nsw i64 %iv, 1		%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %n		%exitcond.not = icmp eq i64 %iv.next, %n
br i1 %exitcond.not, label %for.end, label %for.body		br i1 %exitcond.not, label %for.end, label %for.body

for.end:		for.end:
ret double %add		ret double %add
}		}

		; Expected cost reported for the fmuladd call in @fmuladd_strict32 (defined
		; below) at vectorization factors 4 and 8.
		; CHECK-VF4: Found an estimated cost of 23 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
		; CHECK-VF8: Found an estimated cost of 46 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)

		; Single-block loop that folds a[i] * b[i] into a float accumulator through
		; llvm.fmuladd.f32 and returns the final accumulated value. The call carries
		; no fast-math flags.
		define float @fmuladd_strict32(float* %a, float* %b, i64 %n) {
		entry:
		br label %for.body

		for.body:
		; %iv counts up from 0; %sum.07 is the loop-carried accumulator, seeded with 0.0.
		%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
		%sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
		; Load a[iv] and b[iv].
		%arrayidx = getelementptr inbounds float, float* %a, i64 %iv
		%0 = load float, float* %arrayidx, align 4
		%arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
		%1 = load float, float* %arrayidx2, align 4
		; sum = a[iv] * b[iv] + sum; this is the instruction named by the cost lines above.
		%muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
		; Bottom-tested exit: leave once iv + 1 == n, so the body runs at least once.
		%iv.next = add nuw nsw i64 %iv, 1
		%exitcond.not = icmp eq i64 %iv.next, %n
		br i1 %exitcond.not, label %for.end, label %for.body

		for.end:
		ret float %muladd
		}

		; Declaration of the intrinsic called in the loop above.
		declare float @llvm.fmuladd.f32(float, float, float)

		; Expected cost reported for the fmuladd call in @fmuladd_strict64 (defined
		; below) at vectorization factors 4 and 8.
		; CHECK-VF4: Found an estimated cost of 22 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
		; CHECK-VF8: Found an estimated cost of 44 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)

		; Double-precision variant: a single-block loop that folds a[i] * b[i] into a
		; double accumulator through llvm.fmuladd.f64 and returns the final
		; accumulated value. The call carries no fast-math flags.
		define double @fmuladd_strict64(double* %a, double* %b, i64 %n) {
		entry:
		br label %for.body

		for.body:
		; %iv counts up from 0; %sum.07 is the loop-carried accumulator, seeded with 0.0.
		%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
		%sum.07 = phi double [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
		; Load a[iv] and b[iv].
		; NOTE(review): the double loads are marked align 4, not 8 - confirm this is intended.
		%arrayidx = getelementptr inbounds double, double* %a, i64 %iv
		%0 = load double, double* %arrayidx, align 4
		%arrayidx2 = getelementptr inbounds double, double* %b, i64 %iv
		%1 = load double, double* %arrayidx2, align 4
		; sum = a[iv] * b[iv] + sum; this is the instruction named by the cost lines above.
		%muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
		; Bottom-tested exit: leave once iv + 1 == n, so the body runs at least once.
		%iv.next = add nuw nsw i64 %iv, 1
		%exitcond.not = icmp eq i64 %iv.next, %n
		br i1 %exitcond.not, label %for.end, label %for.body

		for.end:
		ret double %muladd
		}

		; Declaration of the intrinsic called in the loop above.
		declare double @llvm.fmuladd.f64(double, double, double)

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize][CostModel] Update cost model for fmuladd intrinsic
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 379740

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LoopVectorize][CostModel] Update cost model for fmuladd intrinsic
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 379740

llvm/include/llvm/Analysis/TargetTransformInfo.h

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

llvm/include/llvm/CodeGen/BasicTTIImpl.h

llvm/lib/Analysis/TargetTransformInfo.cpp

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll

llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll

[LoopVectorize][CostModel] Update cost model for fmuladd intrinsic
ClosedPublic