Diff 484768

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Show First 20 Lines • Show All 1,039 Lines • ▼ Show 20 Lines	if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
PTest->takeName(&II);		PTest->takeName(&II);

return IC.replaceInstUsesWith(II, PTest);		return IC.replaceInstUsesWith(II, PTest);
}		}

return std::nullopt;		return std::nullopt;
}		}

		template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: Can you give `T` and `M` slightly more descriptive names? :) sdesmalen: nit: Can you give `T` and `M` slightly more descriptive names? :)
static std::optional<Instruction *>		static std::optional<Instruction *>
instCombineSVEVectorFMLA(InstCombiner &IC, IntrinsicInst &II) {		instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
// fold (fadd p a (fmul p b c)) -> (fma p a b c)		bool MergeIntoAddendOp) {
		sdesmalenUnsubmitted Done Reply Inline Actions nit: Perhaps `MergeIntoAddendOp` is a better name? sdesmalen: nit: Perhaps `MergeIntoAddendOp` is a better name?
Value *P = II.getOperand(0);		Value *P = II.getOperand(0);
Value *A = II.getOperand(1);		Value MulOp0, MulOp1, AddendOp, Mul;
auto FMul = II.getOperand(2);		if (MergeIntoAddendOp) {
Value B, C;		AddendOp = II.getOperand(1);
if (!match(FMul, m_Intrinsic<Intrinsic::aarch64_sve_fmul>(		Mul = II.getOperand(2);
m_Specific(P), m_Value(B), m_Value(C))))		} else {
		AddendOp = II.getOperand(2);
		Mul = II.getOperand(1);
		}
		sdesmalenUnsubmitted Done Reply Inline Actions Could you rename these variables such that: A -> MulOp0, B -> MulOp1, C -> AddendOp? That makes the code below (where you swap the order of these in `{P, C, A, B}`, for example) a bit easier to follow. sdesmalen: Could you rename these variables such that: A -> MulOp0 B -> MulOp1 C -> AddendOp That…

		if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
		m_Value(MulOp1))))
return std::nullopt;		return std::nullopt;

if (!FMul->hasOneUse())		if (!Mul->hasOneUse())
return std::nullopt;		return std::nullopt;

		Instruction *FMFSource = nullptr;
		if (II.getType()->isFPOrFPVectorTy()) {
llvm::FastMathFlags FAddFlags = II.getFastMathFlags();		llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
// Stop the combine when the flags on the inputs differ in case dropping flags		// Stop the combine when the flags on the inputs differ in case dropping
// would lead to us missing out on more beneficial optimizations.		// flags would lead to us missing out on more beneficial optimizations.
if (FAddFlags != cast<CallInst>(FMul)->getFastMathFlags())		if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
return std::nullopt;		return std::nullopt;
if (!FAddFlags.allowContract())		if (!FAddFlags.allowContract())
return std::nullopt;		return std::nullopt;
		FMFSource = &II;
		}

IRBuilder<> Builder(II.getContext());		IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);		Builder.SetInsertPoint(&II);
auto FMLA = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fmla,
{II.getType()}, {P, A, B, C}, &II);		CallInst *Res;
FMLA->setFastMathFlags(FAddFlags);		if (MergeIntoAddendOp)
return IC.replaceInstUsesWith(II, FMLA);		Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
		{P, AddendOp, MulOp0, MulOp1}, FMFSource);
		else
		Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
		{P, MulOp0, MulOp1, AddendOp}, FMFSource);

		return IC.replaceInstUsesWith(II, Res);
}		}

static bool isAllActivePredicate(Value *Pred) {		static bool isAllActivePredicate(Value *Pred) {
// Look through convert.from.svbool(convert.to.svbool(...) chain.		// Look through convert.from.svbool(convert.to.svbool(...) chain.
		sdesmalenUnsubmitted Done Reply Inline Actions This function looks very similar to instCombineSVEVectorFMLA, I think we could combine them. There are some subtle differences though: (1) Fast-math flags can only be copied if the type is floating-point. (2) MAD is different from MLA in that the result is merged into the multiplicand, not the addend. You could pass a parameter `WriteAddend` that is used like this: `Value A = II.getOperand(1); Value Mul = II.getOperand(2); if (!WriteAddend) std::swap(A, Mul); if (!match(Mul, m_Intrinsic<Intrinsic::aarch64_sve_mul>(...))` If you pass in the opcode to test for (in this case `Intrinsic::aarch64_sve_mul`) and the result `Intrinsic::aarch64_sve_mad`, then you could generalise this. sdesmalen: This function looks very similar to instCombineSVEVectorFMLA, I think we could combine them.
		MattDevereauAuthorUnsubmitted Done Reply Inline Actions I've reworked `instCombineSVEVectorFMLA` into a function that can handle both the Addend/Not-Addend case which also handles floating-point vectors. MattDevereau: I've reworked `instCombineSVEVectorFMLA` into a function that can handle both the Addend/Not…
Value *UncastedPred;		Value *UncastedPred;
if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(		if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(		m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
m_Value(UncastedPred)))))		m_Value(UncastedPred)))))
// If the predicate has the same or less lanes than the uncasted		// If the predicate has the same or less lanes than the uncasted
// predicate then we know the casting has no effect.		// predicate then we know the casting has no effect.
if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=		if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())		cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
IRBuilder<> Builder(II.getContext());		IRBuilder<> Builder(II.getContext());
Builder.SetInsertPoint(&II);		Builder.SetInsertPoint(&II);
Builder.setFastMathFlags(II.getFastMathFlags());		Builder.setFastMathFlags(II.getFastMathFlags());
auto BinOp =		auto BinOp =
Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));		Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
return IC.replaceInstUsesWith(II, BinOp);		return IC.replaceInstUsesWith(II, BinOp);
}		}

static std::optional<Instruction *>		static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {		IntrinsicInst &II) {
if (auto FMLA = instCombineSVEVectorFMLA(IC, II))		if (auto FMLA =
		instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
		Intrinsic::aarch64_sve_fmla>(IC, II,
		true))
return FMLA;		return FMLA;
		if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
		Intrinsic::aarch64_sve_mla>(
		IC, II, true))
		return MLA;
		if (auto FMAD =
		instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
		Intrinsic::aarch64_sve_fmad>(IC, II,
		false))
		return FMAD;
		if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
		Intrinsic::aarch64_sve_mad>(
		IC, II, false))
		return MAD;
		return instCombineSVEVectorBinOp(IC, II);
		}

		static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
		IntrinsicInst &II) {
		if (auto FMLS =
		instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
		Intrinsic::aarch64_sve_fmls>(IC, II,
		true))
		return FMLS;
		if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
		Intrinsic::aarch64_sve_mls>(
		IC, II, true))
		return MLS;
		if (auto FMSB =
		instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
		Intrinsic::aarch64_sve_fnmsb>(
		IC, II, false))
		return FMSB;
return instCombineSVEVectorBinOp(IC, II);		return instCombineSVEVectorBinOp(IC, II);
		sdesmalenUnsubmitted Not Done Reply Inline Actions nit: Can you add a comment saying that there is no integer version of nmsb for the equivalent integer case of the above? (is there a negative test available for that case?) sdesmalen: nit: Can you add a comment saying that there is no integer version of nmsb for the equivalent…
		MattDevereauAuthorUnsubmitted Done Reply Inline Actions I'm personally of the opinion that comments describing the absence of things are bloat but I'm happy to put it in. Regarding the test, I'm not sure what it would be beyond a `sub(mul)` with no transformation. Without the context of other tests surrounding it, I'm not sure it would make much sense. MattDevereau: I'm personally of the opinion that comments describing the absence of things are bloat but I'm…
		sdesmalenUnsubmitted Not Done Reply Inline Actions > "Regarding the test, I'm not sure what it would be beyond a sub(mul) with no transformation. Without the context of other tests surrounding it, I'm not sure it would make much sense." The 'no transformation' would be the point, i.e. there is a positive test for the floating-point case that translates to fnmsb, and we'd have a negative test for the integer form because there is no equivalent integer instruction (it just covers that we don't make any wrong assumptions). > "I'm personally of the opinion that comments describing the absence of things are bloat but I'm happy to put it in." Fair enough, I'm happy for you to omit adding a comment here, if we have a test that covers it with a comment that explains the need for the test. That should guard it from adding an invalid transform in the future. sdesmalen: > Regarding the test, I'm not sure what it would be beyond a sub(mul) with no transformation.
		MattDevereauAuthorUnsubmitted Done Reply Inline Actions I've added a test for this with a comment describing why it's there. MattDevereau: I've added a test for this with a comment describing why it's there.
}		}

static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,		static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
IntrinsicInst &II) {		IntrinsicInst &II) {
auto *OpPredicate = II.getOperand(0);		auto *OpPredicate = II.getOperand(0);
auto *OpMultiplicand = II.getOperand(1);		auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);		auto *OpMultiplier = II.getOperand(2);

▲ Show 20 Lines • Show All 283 Lines • ▼ Show 20 Lines	AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
case Intrinsic::aarch64_sve_ptest_any:		case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:		case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:		case Intrinsic::aarch64_sve_ptest_last:
return instCombineSVEPTest(IC, II);		return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_mul:		case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_fmul:		case Intrinsic::aarch64_sve_fmul:
return instCombineSVEVectorMul(IC, II);		return instCombineSVEVectorMul(IC, II);
case Intrinsic::aarch64_sve_fadd:		case Intrinsic::aarch64_sve_fadd:
return instCombineSVEVectorFAdd(IC, II);		case Intrinsic::aarch64_sve_add:
		return instCombineSVEVectorAdd(IC, II);
case Intrinsic::aarch64_sve_fsub:		case Intrinsic::aarch64_sve_fsub:
		sdesmalenUnsubmitted Done Reply Inline Actions It would be nice if we could do the same thing for subtracts (fmls/fmsb/mls/msb) sdesmalen: It would be nice if we could do the same thing for subtracts (fmls/fmsb/mls/msb)
		MattDevereauAuthorUnsubmitted Done Reply Inline Actions I've added support for these now. MattDevereau: I've added support for these now.
return instCombineSVEVectorBinOp(IC, II);		case Intrinsic::aarch64_sve_sub:
		return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_tbl:		case Intrinsic::aarch64_sve_tbl:
return instCombineSVETBL(IC, II);		return instCombineSVETBL(IC, II);
case Intrinsic::aarch64_sve_uunpkhi:		case Intrinsic::aarch64_sve_uunpkhi:
case Intrinsic::aarch64_sve_uunpklo:		case Intrinsic::aarch64_sve_uunpklo:
case Intrinsic::aarch64_sve_sunpkhi:		case Intrinsic::aarch64_sve_sunpkhi:
case Intrinsic::aarch64_sve_sunpklo:		case Intrinsic::aarch64_sve_sunpklo:
return instCombineSVEUnpack(IC, II);		return instCombineSVEUnpack(IC, II);
case Intrinsic::aarch64_sve_zip1:		case Intrinsic::aarch64_sve_zip1:
▲ Show 20 Lines • Show All 1,767 Lines • Show Last 20 Lines

llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladd.ll

This file was moved to llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladdsub.ll.

llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladdsub.ll

This file was moved from llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladd.ll.

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -S -passes=instcombine < %s \| FileCheck %s			; RUN: opt -S -passes=instcombine < %s \| FileCheck %s

	target triple = "aarch64-unknown-linux-gnu"			target triple = "aarch64-unknown-linux-gnu"

	define dso_local <vscale x 8 x half> @combine_fmla(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 8 x half> @combine_fmla(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @combine_fmla(			; CHECK-LABEL: @combine_fmla(
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.]] = call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.]])			; CHECK-NEXT: [[TMP2:%.]] = call fast <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[C:%.]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]])
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
	;			;
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
	%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %2)			%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %c, <vscale x 8 x half> %2)
	ret <vscale x 8 x half> %3			ret <vscale x 8 x half> %3
	}			}

	define dso_local <vscale x 8 x half> @neg_combine_fmla_mul_first_operand(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 16 x i8> @combine_mla_i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @neg_combine_fmla_mul_first_operand(			; CHECK-LABEL: @combine_mla_i8(
				; CHECK-NEXT: [[TMP1:%.]] = call <vscale x 16 x i8> @llvm.aarch64.sve.mla.nxv16i8(<vscale x 16 x i1> [[P:%.]], <vscale x 16 x i8> [[C:%.]], <vscale x 16 x i8> [[A:%.]], <vscale x 16 x i8> [[B:%.*]])
				; CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
				;
				%1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
				%2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %c, <vscale x 16 x i8> %1)
				ret <vscale x 16 x i8> %2
				}

				define dso_local <vscale x 8 x half> @combine_fmad(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
				; CHECK-LABEL: @combine_fmad(
				sdesmalenUnsubmitted Not Done Reply Inline Actions Can you remove things like `dso_local` and `local_unnamed_addr` from these tests? sdesmalen: Can you remove things like `dso_local` and `local_unnamed_addr` from these tests?
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.*]])			; CHECK-NEXT: [[TMP2:%.]] = call fast <vscale x 8 x half> @llvm.aarch64.sve.fmad.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.]])
	; CHECK-NEXT: [[TMP3:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[A:%.]])			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP3]]			;
				%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
				%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
				%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %c)
				ret <vscale x 8 x half> %3
				}

				define dso_local <vscale x 16 x i8> @combine_mad_i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) local_unnamed_addr #0 {
				; CHECK-LABEL: @combine_mad_i8(
				; CHECK-NEXT: [[TMP1:%.]] = call <vscale x 16 x i8> @llvm.aarch64.sve.mad.nxv16i8(<vscale x 16 x i1> [[P:%.]], <vscale x 16 x i8> [[A:%.]], <vscale x 16 x i8> [[B:%.]], <vscale x 16 x i8> [[C:%.*]])
				; CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
				;
				%1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
				%2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %1, <vscale x 16 x i8> %c)
				ret <vscale x 16 x i8> %2
				}

				define dso_local <vscale x 8 x half> @combine_fmls(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
				; CHECK-LABEL: @combine_fmls(
				; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
				; CHECK-NEXT: [[TMP2:%.]] = call fast <vscale x 8 x half> @llvm.aarch64.sve.fmls.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[C:%.]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]])
				; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
	;			;
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
	%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %a)			%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %c, <vscale x 8 x half> %2)
	ret <vscale x 8 x half> %3			ret <vscale x 8 x half> %3
	}			}

	define dso_local <vscale x 8 x half> @neg_combine_fmla_contract_flag_only(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 16 x i8> @combine_mls_i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @neg_combine_fmla_contract_flag_only(			; CHECK-LABEL: @combine_mls_i8(
				; CHECK-NEXT: [[TMP1:%.]] = call <vscale x 16 x i8> @llvm.aarch64.sve.mls.nxv16i8(<vscale x 16 x i1> [[P:%.]], <vscale x 16 x i8> [[C:%.]], <vscale x 16 x i8> [[A:%.]], <vscale x 16 x i8> [[B:%.*]])
				; CHECK-NEXT: ret <vscale x 16 x i8> [[TMP1]]
				;
				%1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
				%2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %c, <vscale x 16 x i8> %1)
				ret <vscale x 16 x i8> %2
				}

				define dso_local <vscale x 8 x half> @combine_fnmsb(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
				; CHECK-LABEL: @combine_fnmsb(
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.]] = call contract <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.]])			; CHECK-NEXT: [[TMP2:%.]] = call fast <vscale x 8 x half> @llvm.aarch64.sve.fnmsb.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.]])
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
	;			;
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
	%3 = tail call contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %2)			%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %2, <vscale x 8 x half> %c)
				ret <vscale x 8 x half> %3
				}

				; No integer variant of fnmsb exists; Do not combine
				define dso_local <vscale x 16 x i8> @neg_combine_nmsb_i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) local_unnamed_addr #0 {
				; CHECK-LABEL: @neg_combine_nmsb_i8(
				; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> [[P:%.]], <vscale x 16 x i8> [[A:%.]], <vscale x 16 x i8> [[B:%.]])
				; CHECK-NEXT: [[TMP2:%.]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1> [[P]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[C:%.]])
				; CHECK-NEXT: ret <vscale x 16 x i8> [[TMP2]]
				;
				%1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
				%2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %1, <vscale x 16 x i8> %c)
				ret <vscale x 16 x i8> %2
				}

				define dso_local <vscale x 8 x half> @combine_fmla_contract_flag_only(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
				; CHECK-LABEL: @combine_fmla_contract_flag_only(
				; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
				; CHECK-NEXT: [[TMP2:%.]] = call contract <vscale x 8 x half> @llvm.aarch64.sve.fmla.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[C:%.]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.]])
				; CHECK-NEXT: ret <vscale x 8 x half> [[TMP2]]
				;
				%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
				%2 = tail call contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
				%3 = tail call contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %c, <vscale x 8 x half> %2)
	ret <vscale x 8 x half> %3			ret <vscale x 8 x half> %3
	}			}

	define dso_local <vscale x 8 x half> @neg_combine_fmla_no_flags(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 8 x half> @neg_combine_fmla_no_flags(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @neg_combine_fmla_no_flags(			; CHECK-LABEL: @neg_combine_fmla_no_flags(
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.*]])			; CHECK-NEXT: [[TMP2:%.]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.*]])
	; CHECK-NEXT: [[TMP3:%.]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[TMP2]])			; CHECK-NEXT: [[TMP3:%.]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[C:%.]], <vscale x 8 x half> [[TMP2]])
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP3]]			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP3]]
	;			;
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
	%3 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %2)			%3 = tail call <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %c, <vscale x 8 x half> %2)
	ret <vscale x 8 x half> %3			ret <vscale x 8 x half> %3
	}			}

	define dso_local <vscale x 8 x half> @neg_combine_fmla_neq_pred(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 8 x half> @neg_combine_fmla_neq_pred(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @neg_combine_fmla_neq_pred(			; CHECK-LABEL: @neg_combine_fmla_neq_pred(
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)			; CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
	; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])			; CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[TMP2]])
	; CHECK-NEXT: [[TMP4:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.*]])			; CHECK-NEXT: [[TMP4:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.*]])
	; CHECK-NEXT: [[TMP5:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP3]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[TMP4]])			; CHECK-NEXT: [[TMP5:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP3]], <vscale x 8 x half> [[C:%.]], <vscale x 8 x half> [[TMP4]])
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP5]]			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP5]]
	;			;
	; ret <vscale x 8 x half> %9			; ret <vscale x 8 x half> %9
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)			%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
	%3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)			%3 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %2)
	%4 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%4 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
	%5 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %3, <vscale x 8 x half> %a, <vscale x 8 x half> %4)			%5 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %3, <vscale x 8 x half> %c, <vscale x 8 x half> %4)
	ret <vscale x 8 x half> %5			ret <vscale x 8 x half> %5
	}			}

	define dso_local <vscale x 8 x half> @neg_combine_fmla_two_fmul_uses(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 8 x half> @neg_combine_fmla_two_fmul_uses(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @neg_combine_fmla_two_fmul_uses(			; CHECK-LABEL: @neg_combine_fmla_two_fmul_uses(
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.*]])			; CHECK-NEXT: [[TMP2:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[B:%.*]])
	; CHECK-NEXT: [[TMP3:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[TMP2]])			; CHECK-NEXT: [[TMP3:%.]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[C:%.]], <vscale x 8 x half> [[TMP2]])
	; CHECK-NEXT: [[TMP4:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP2]])			; CHECK-NEXT: [[TMP4:%.*]] = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP2]])
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP4]]			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP4]]
	;			;
	; ret <vscale x 8 x half> %8			; ret <vscale x 8 x half> %8
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%2 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
	%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %2)			%3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %c, <vscale x 8 x half> %2)
	%4 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %3, <vscale x 8 x half> %2)			%4 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %3, <vscale x 8 x half> %2)
	ret <vscale x 8 x half> %4			ret <vscale x 8 x half> %4
	}			}

	define dso_local <vscale x 8 x half> @neg_combine_fmla_neq_flags(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {			define dso_local <vscale x 8 x half> @neg_combine_fmla_neq_flags(<vscale x 16 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) local_unnamed_addr #0 {
	; CHECK-LABEL: @neg_combine_fmla_neq_flags(			; CHECK-LABEL: @neg_combine_fmla_neq_flags(
	; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])			; CHECK-NEXT: [[TMP1:%.]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[P:%.]])
	; CHECK-NEXT: [[TMP2:%.]] = tail call reassoc nnan contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.*]])			; CHECK-NEXT: [[TMP2:%.]] = tail call reassoc nnan contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[B:%.]], <vscale x 8 x half> [[C:%.*]])
	; CHECK-NEXT: [[TMP3:%.]] = tail call reassoc contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[TMP2]])			; CHECK-NEXT: [[TMP3:%.]] = tail call reassoc contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> [[TMP1]], <vscale x 8 x half> [[A:%.]], <vscale x 8 x half> [[TMP2]])
	; CHECK-NEXT: ret <vscale x 8 x half> [[TMP3]]			; CHECK-NEXT: ret <vscale x 8 x half> [[TMP3]]
	;			;
	; ret <vscale x 8 x half> %7			; ret <vscale x 8 x half> %7
	%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)			%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %p)
	%2 = tail call reassoc nnan contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)			%2 = tail call reassoc nnan contract <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %b, <vscale x 8 x half> %c)
	%3 = tail call reassoc contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %2)			%3 = tail call reassoc contract <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1> %1, <vscale x 8 x half> %a, <vscale x 8 x half> %2)
	ret <vscale x 8 x half> %3			ret <vscale x 8 x half> %3
	}			}

				declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
	declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)			declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
				declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
				declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
	declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)			declare <vscale x 8 x half> @llvm.aarch64.sve.fmul.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
	declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)			declare <vscale x 8 x half> @llvm.aarch64.sve.fadd.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
	declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)			declare <vscale x 8 x half> @llvm.aarch64.sve.fsub.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>, <vscale x 8 x half>)
				sdesmalenUnsubmitted Not Done Reply Inline Actions The result is merged into `%c`, not the multiplicand (`%1`), so this should be resulting in `sve.mla` instead. sdesmalen: The result is merged into `%c`, not the multiplicand (`%1`), so this should be resulting in…
				declare <vscale x 16 x i8> @llvm.aarch64.sve.mul.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
				declare <vscale x 16 x i8> @llvm.aarch64.sve.add.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)
				declare <vscale x 16 x i8> @llvm.aarch64.sve.sub.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>, <vscale x 16 x i8>)

	attributes #0 = { "target-features"="+sve" }			attributes #0 = { "target-features"="+sve" }
				sdesmalenUnsubmitted Done Reply Inline Actions You can't make the same assumptions for MSB as you can for MAD, because the meaning of this is `%a - (%b * %c)`, which is not equal to `(%b * %c) - %a`. sdesmalen: You can't make the same assumptions for MSB as you can for MAD, because the meaning of this is…
				MattDevereauAuthorUnsubmitted Done Reply Inline Actions I've removed MSB and changed FMSB to FNMSB MattDevereau: I've removed MSB and changed FMSB to FNMSB
				sdesmalenUnsubmitted Done Reply Inline Actions I don't necessarily think you need to repeat these negative tests for all the fmla/fmad/fmls/.. combinations (same for the tests above (predicate not matching, not the right fast-math flags, etc.)). Maybe you can have all positive tests first, and then all the negative tests for just the `fmla`, since they should directly translate to the other mul-add intrinsics. sdesmalen: I don't necessarily think you need to repeat these negative tests for all the fmla/fmad/fmls/..
				MattDevereauAuthorUnsubmitted Done Reply Inline Actions Sure, these were just low overhead copy/pasted tests and the repetition is unnecessary. MattDevereau: Sure, these were just low overhead copy/pasted tests and the repetition is unnecessary.
				sdesmalenUnsubmitted Done Reply Inline Actions I don't think we need these tests for all the element types, probably doing this for 1 FP type and 1 Integer type is sufficient. sdesmalen: I don't think we need these tests for all the element types, probably doing this for 1 FP type…

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][InstCombine] Fuse ADD+MUL and SUB+MUL AArch64 intrinsics
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 484768

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladd.ll

llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladdsub.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][InstCombine] Fuse ADD+MUL and SUB+MUL AArch64 intrinsics ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 484768

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladd.ll

llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-muladdsub.ll

[AArch64][InstCombine] Fuse ADD+MUL and SUB+MUL AArch64 intrinsics
ClosedPublic