Diff 384763

llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

Show First 20 Lines • Show All 393 Lines • ▼ Show 20 Lines	bool matchCombineFAddFMAFMulToFMadOrFMA(MachineInstr &MI,
BuildFnTy &MatchInfo);		BuildFnTy &MatchInfo);

// Transform (fadd (fma x, y, (fpext (fmul u, v))), z)		// Transform (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))		// -> (fma x, y, (fma (fpext u), (fpext v), z))
// -> (fmad x, y, (fma (fpext u), (fpext v), z))		// -> (fmad x, y, (fma (fpext u), (fpext v), z))
bool matchCombineFAddFpExtFMulToFMadOrFMAAggressive(MachineInstr &MI,		bool matchCombineFAddFpExtFMulToFMadOrFMAAggressive(MachineInstr &MI,
BuildFnTy &MatchInfo);		BuildFnTy &MatchInfo);

		/// Transform (fsub (fmul x, y), z) -> (fma x, y, -z)
		/// -> (fmad x, y, -z)
		/// The definition also folds the mirrored form,
		/// (fsub x, (fmul y, z)) -> (fma -y, z, x)
		/// -> (fmad -y, z, x)
		/// Returns true when a fold applies; \p MatchInfo then holds the callback
		/// that builds the replacement instructions.
		bool matchCombineFSubFMulToFMadOrFMA(MachineInstr &MI, BuildFnTy &MatchInfo);

/// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).		/// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x).
bool matchCombineTruncOfExt(MachineInstr &MI,		bool matchCombineTruncOfExt(MachineInstr &MI,
std::pair<Register, unsigned> &MatchInfo);		std::pair<Register, unsigned> &MatchInfo);
void applyCombineTruncOfExt(MachineInstr &MI,		void applyCombineTruncOfExt(MachineInstr &MI,
std::pair<Register, unsigned> &MatchInfo);		std::pair<Register, unsigned> &MatchInfo);

/// Transform trunc (shl x, K) to shl (trunc x),		/// Transform trunc (shl x, K) to shl (trunc x),
/// K => K < VT.getScalarSizeInBits().		/// K => K < VT.getScalarSizeInBits().
▲ Show 20 Lines • Show All 307 Lines • Show Last 20 Lines

llvm/include/llvm/Target/GlobalISel/Combine.td

Show First 20 Lines • Show All 654 Lines • ▼ Show 20 Lines
// (fma x, y, (fma (fpext u), (fpext v), z))		// (fma x, y, (fma (fpext u), (fpext v), z))
def combine_fadd_fpext_fma_fmul_to_fmad_or_fma: GICombineRule<		def combine_fadd_fpext_fma_fmul_to_fmad_or_fma: GICombineRule<
(defs root:$root, build_fn_matchinfo:$info),		(defs root:$root, build_fn_matchinfo:$info),
(match (wip_match_opcode G_FADD):$root,		(match (wip_match_opcode G_FADD):$root,
[{ return Helper.matchCombineFAddFpExtFMulToFMadOrFMAAggressive(		[{ return Helper.matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
*${root}, ${info}); }]),		*${root}, ${info}); }]),
(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;		(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;

		// Transform (fsub (fmul x, y), z) -> (fma x, y, -z)
		// -> (fmad x, y, -z)
		// The matcher also handles the multiply on the right-hand side:
		// (fsub x, (fmul y, z)) -> (fma -y, z, x)
		// -> (fmad -y, z, x)
		// Matching is rooted at G_FSUB; the rewrite itself is the build function
		// stored in $info by the matcher and run by applyBuildFn.
		def combine_fsub_fmul_to_fmad_or_fma: GICombineRule<
		(defs root:$root, build_fn_matchinfo:$info),
		(match (wip_match_opcode G_FSUB):$root,
		[{ return Helper.matchCombineFSubFMulToFMadOrFMA(*${root},
		${info}); }]),
		(apply [{ Helper.applyBuildFn(*${root}, ${info}); }])>;

// Currently only the one combine above.		// Currently only the one combine above.
def insert_vec_elt_combines : GICombineGroup<		def insert_vec_elt_combines : GICombineGroup<
[combine_insert_vec_elts_build_vector]>;		[combine_insert_vec_elts_build_vector]>;

def extract_vec_elt_build_vec : GICombineRule<		def extract_vec_elt_build_vec : GICombineRule<
(defs root:$root, register_matchinfo:$matchinfo),		(defs root:$root, register_matchinfo:$matchinfo),
(match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,		(match (wip_match_opcode G_EXTRACT_VECTOR_ELT):$root,
[{ return Helper.matchExtractVecEltBuildVec(*${root}, ${matchinfo}); }]),		[{ return Helper.matchExtractVecEltBuildVec(*${root}, ${matchinfo}); }]),
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shl,		unmerge_zext_to_zext, merge_unmerge, trunc_ext_fold, trunc_shl,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,		const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,		shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
truncstore_merge, div_rem_to_divrem, funnel_shift_combines,		truncstore_merge, div_rem_to_divrem, funnel_shift_combines,
form_bitfield_extract, constant_fold, fabs_fneg_fold,		form_bitfield_extract, constant_fold, fabs_fneg_fold,
intdiv_combines, mulh_combines, redundant_neg_operands,		intdiv_combines, mulh_combines, redundant_neg_operands,
combine_fadd_fmul_to_fmad_or_fma, combine_fadd_fpext_fmul_to_fmad_or_fma,		combine_fadd_fmul_to_fmad_or_fma, combine_fadd_fpext_fmul_to_fmad_or_fma,
combine_fadd_fma_fmul_to_fmad_or_fma,		combine_fadd_fma_fmul_to_fmad_or_fma,
combine_fadd_fpext_fma_fmul_to_fmad_or_fma]>;		combine_fadd_fpext_fma_fmul_to_fmad_or_fma,
		combine_fsub_fmul_to_fmad_or_fma]>;

// A combine group used for prelegalizer combiners at -O0. The combines in		// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and		// this group have been selected based on experiments to balance code size and
// compile time performance.		// compile time performance.
def optnone_combines : GICombineGroup<[trivial_combines,		def optnone_combines : GICombineGroup<[trivial_combines,
ptr_add_immed_chain, combines_for_extload,		ptr_add_immed_chain, combines_for_extload,
not_cmp_fold, opt_brcond_by_inverting_cond]>;		not_cmp_fold, opt_brcond_by_inverting_cond]>;

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Show First 20 Lines • Show All 4,076 Lines • ▼ Show 20 Lines
}		}

bool CombinerHelper::matchBitfieldExtractFromShr(		bool CombinerHelper::matchBitfieldExtractFromShr(
MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {		MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
const unsigned Opcode = MI.getOpcode();		const unsigned Opcode = MI.getOpcode();
assert(Opcode == TargetOpcode::G_ASHR \|\| Opcode == TargetOpcode::G_LSHR);		assert(Opcode == TargetOpcode::G_ASHR \|\| Opcode == TargetOpcode::G_LSHR);

const Register Dst = MI.getOperand(0).getReg();		const Register Dst = MI.getOperand(0).getReg();

const unsigned ExtrOpcode = Opcode == TargetOpcode::G_ASHR		const unsigned ExtrOpcode = Opcode == TargetOpcode::G_ASHR
		arsenmUnsubmitted Not Done Reply Inline Actions This should probably allow vectors we can break down later too arsenm: This should probably allow vectors we can break down later too
? TargetOpcode::G_SBFX		? TargetOpcode::G_SBFX
: TargetOpcode::G_UBFX;		: TargetOpcode::G_UBFX;
		arsenmUnsubmitted Not Done Reply Inline Actions Don't see where isFMADLegal is defined arsenm: Don't see where isFMADLegal is defined
		matejamAuthorUnsubmitted Done Reply Inline Actions In the other revision (the parent): D93305 matejam: In the other revision (the parent): [[ https://reviews.llvm.org/D93305 \| D93305 ]]

// Check if the type we would use for the extract is legal		// Check if the type we would use for the extract is legal
LLT Ty = MRI.getType(Dst);		LLT Ty = MRI.getType(Dst);
LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);		LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
if (!LI \|\| !LI->isLegalOrCustom({ExtrOpcode, {Ty, ExtractTy}}))		if (!LI \|\| !LI->isLegalOrCustom({ExtrOpcode, {Ty, ExtractTy}}))
return false;		return false;

Register ShlSrc;		Register ShlSrc;
int64_t ShrAmt;		int64_t ShrAmt;
int64_t ShlAmt;		int64_t ShlAmt;
const unsigned Size = Ty.getScalarSizeInBits();		const unsigned Size = Ty.getScalarSizeInBits();

// Try to match shr (shl x, c1), c2		// Try to match shr (shl x, c1), c2
if (!mi_match(Dst, MRI,		if (!mi_match(Dst, MRI,
m_BinOp(Opcode,		m_BinOp(Opcode,
m_OneNonDBGUse(m_GShl(m_Reg(ShlSrc), m_ICst(ShlAmt))),		m_OneNonDBGUse(m_GShl(m_Reg(ShlSrc), m_ICst(ShlAmt))),
m_ICst(ShrAmt))))		m_ICst(ShrAmt))))
return false;		return false;

// Make sure that the shift sizes can fit a bitfield extract		// Make sure that the shift sizes can fit a bitfield extract
if (ShlAmt < 0 \|\| ShlAmt > ShrAmt \|\| ShrAmt >= Size)		if (ShlAmt < 0 \|\| ShlAmt > ShrAmt \|\| ShrAmt >= Size)
return false;		return false;

// Skip this combine if the G_SEXT_INREG combine could handle it		// Skip this combine if the G_SEXT_INREG combine could handle it
if (Opcode == TargetOpcode::G_ASHR && ShlAmt == ShrAmt)		if (Opcode == TargetOpcode::G_ASHR && ShlAmt == ShrAmt)
return false;		return false;

// Calculate start position and width of the extract		// Calculate start position and width of the extract
const int64_t Pos = ShrAmt - ShlAmt;		const int64_t Pos = ShrAmt - ShlAmt;
const int64_t Width = Size - ShrAmt;		const int64_t Width = Size - ShrAmt;

		arsenmUnsubmitted Not Done Reply Inline Actions I'm not sure I follow this heuristic, or what SwapPriority means arsenm: I'm not sure I follow this heuristic, or what SwapPriority means
		matejamAuthorUnsubmitted Done Reply Inline Actions If SwapPriority is equal to 0, that means that the first and second operands aren't both fmul instructions; if it's equal to 2, it means that both of the arguments are fmul and that the second arg has fewer uses, so we pick it for folding, and vice versa if it's equal to 1. I will make it simpler in the next version. matejam: If SwapPriority is equal to 0, that means that the first and second operands aren't both fmul…
MatchInfo = [=](MachineIRBuilder &B) {		MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);		auto WidthCst = B.buildConstant(ExtractTy, Width);
auto PosCst = B.buildConstant(ExtractTy, Pos);		auto PosCst = B.buildConstant(ExtractTy, Pos);
B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});		B.buildInstr(ExtrOpcode, {Dst}, {ShlSrc, PosCst, WidthCst});
};		};
return true;		return true;
}		}

Show All 24 Lines	bool CombinerHelper::matchBitfieldExtractFromShrAnd(

// Check that ubfx can do the extraction, with no holes in the mask.		// Check that ubfx can do the extraction, with no holes in the mask.
uint64_t UMask = SMask;		uint64_t UMask = SMask;
UMask \|= maskTrailingOnes<uint64_t>(ShrAmt);		UMask \|= maskTrailingOnes<uint64_t>(ShrAmt);
UMask &= maskTrailingOnes<uint64_t>(Size);		UMask &= maskTrailingOnes<uint64_t>(Size);
if (!isMask_64(UMask))		if (!isMask_64(UMask))
return false;		return false;

// Calculate start position and width of the extract.		// Calculate start position and width of the extract.
const int64_t Pos = ShrAmt;		const int64_t Pos = ShrAmt;
		arsenmUnsubmitted Not Done Reply Inline Actions The types are all identical, there's no reason to query every type arsenm: The types are all identical, there's no reason to query every type
const int64_t Width = countTrailingOnes(UMask) - ShrAmt;		const int64_t Width = countTrailingOnes(UMask) - ShrAmt;

// It's preferable to keep the shift, rather than form G_SBFX.		// It's preferable to keep the shift, rather than form G_SBFX.
		arsenmUnsubmitted Not Done Reply Inline Actions You can directly use the type and avoid the explicit createGenericVirtualRegister with auto Neg = B.buildFNeg(Ty, X) arsenm: You can directly use the type and avoid the explicit createGenericVirtualRegister with auto Neg…
// TODO: remove the G_AND via demanded bits analysis.		// TODO: remove the G_AND via demanded bits analysis.
if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)		if (Opcode == TargetOpcode::G_ASHR && Width + ShrAmt == Size)
return false;		return false;

MatchInfo = [=](MachineIRBuilder &B) {		MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(Ty, Width);		auto WidthCst = B.buildConstant(Ty, Width);
auto PosCst = B.buildConstant(Ty, Pos);		auto PosCst = B.buildConstant(Ty, Pos);
B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});		B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
▲ Show 20 Lines • Show All 257 Lines • ▼ Show 20 Lines	bool CombinerHelper::matchNarrowBinopFeedingAnd(
unsigned NarrowWidth = Mask.countTrailingOnes();		unsigned NarrowWidth = Mask.countTrailingOnes();
if (NarrowWidth == WideTy.getSizeInBits())		if (NarrowWidth == WideTy.getSizeInBits())
return false;		return false;
LLT NarrowTy = LLT::scalar(NarrowWidth);		LLT NarrowTy = LLT::scalar(NarrowWidth);

// Check if adding the zext + truncates could be harmful.		// Check if adding the zext + truncates could be harmful.
auto &MF = *MI.getMF();		auto &MF = *MI.getMF();
const auto &TLI = getTargetLowering();		const auto &TLI = getTargetLowering();
LLVMContext &Ctx = MF.getFunction().getContext();		LLVMContext &Ctx = MF.getFunction().getContext();
		foadUnsubmitted Not Done Reply Inline Actions Typo "refers", and they're called MI0 and MI1. foad: Typo "refers", and they're called MI0 and MI1.
auto &DL = MF.getDataLayout();		auto &DL = MF.getDataLayout();
if (!TLI.isTruncateFree(WideTy, NarrowTy, DL, Ctx) \|\|		if (!TLI.isTruncateFree(WideTy, NarrowTy, DL, Ctx) \|\|
!TLI.isZExtFree(NarrowTy, WideTy, DL, Ctx))		!TLI.isZExtFree(NarrowTy, WideTy, DL, Ctx))
return false;		return false;
if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {NarrowTy, WideTy}}) \|\|		if (!isLegalOrBeforeLegalizer({TargetOpcode::G_TRUNC, {NarrowTy, WideTy}}) \|\|
		foadUnsubmitted Not Done Reply Inline Actions Use hasMoreUses() from the previous patch. foad: Use hasMoreUses() from the previous patch.
!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {WideTy, NarrowTy}}))		!isLegalOrBeforeLegalizer({TargetOpcode::G_ZEXT, {WideTy, NarrowTy}}))
return false;		return false;
Register BinOpLHS = LHSInst->getOperand(1).getReg();		Register BinOpLHS = LHSInst->getOperand(1).getReg();
Register BinOpRHS = LHSInst->getOperand(2).getReg();		Register BinOpRHS = LHSInst->getOperand(2).getReg();
MatchInfo = [=, &MI](MachineIRBuilder &B) {		MatchInfo = [=, &MI](MachineIRBuilder &B) {
auto NarrowLHS = Builder.buildTrunc(NarrowTy, BinOpLHS);		auto NarrowLHS = Builder.buildTrunc(NarrowTy, BinOpLHS);
auto NarrowRHS = Builder.buildTrunc(NarrowTy, BinOpRHS);		auto NarrowRHS = Builder.buildTrunc(NarrowTy, BinOpRHS);
auto NarrowBinOp =		auto NarrowBinOp =
▲ Show 20 Lines • Show All 615 Lines • ▼ Show 20 Lines	if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
};		};
return true;		return true;
}		}
}		}

return false;		return false;
}		}

		/// Match a G_FSUB with a contractable G_FMUL operand and prepare its
		/// replacement by a fused multiply-add:
		///   (fsub (fmul x, y), z) -> (fma x, y, -z)
		///   (fsub x, (fmul y, z)) -> (fma -y, z, x)
		/// G_FMAD is emitted instead of G_FMA when canCombineFMadOrFMA reports
		/// HasFMAD.
		///
		/// \param MI        the G_FSUB root instruction (asserted).
		/// \param MatchInfo on success, set to a callback that builds the G_FNEG
		///                  and the fused instruction defining MI's result register.
		/// \returns true if one of the two folds applies, false otherwise.
		bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
		MachineInstr &MI,
		std::function<void(MachineIRBuilder &)> &MatchInfo) {
		assert(MI.getOpcode() == TargetOpcode::G_FSUB);

		// The three flags are outputs of canCombineFMadOrFMA; bail out if it
		// rejects the instruction.
		bool AllowFusionGlobally, HasFMAD, Aggressive;
		if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
		return false;

		// Defining instructions of the minuend (LHS) and the subtrahend (RHS).
		MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
		MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
		LLT DstTy = MRI.getType(MI.getOperand(0).getReg());

		// If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
		// prefer to fold the multiply with fewer uses.
		// NOTE(review): declared `int` but only ever holds true/false; `bool`
		// looks like the intended type.
		int FirstMulHasFewerUses = true;
		if (isContractableFMul(*LHS, AllowFusionGlobally) &&
		isContractableFMul(*RHS, AllowFusionGlobally) &&
		hasMoreUses(LHS, RHS, MRI))
		FirstMulHasFewerUses = false;

		unsigned PreferredFusedOpcode =
		HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;

		// fold (fsub (fmul x, y), z) -> (fma x, y, -z)
		// Unless Aggressive is set, the multiply must have a single non-debug use.
		if (FirstMulHasFewerUses &&
		(isContractableFMul(*LHS, AllowFusionGlobally) &&
		(Aggressive \|\| MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) {
		MatchInfo = [=, &MI](MachineIRBuilder &B) {
		Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg())
		.getReg(0);
		B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
		{LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(),
		NegZ});
		mbrkusaninUnsubmitted Not Done Reply Inline Actions Rename Src3 to NegZ so it matches the comment above. Use B not Builder. mbrkusanin: Rename Src3 to NegZ so it matches the comment above. Use B not Builder.
		};

		return true;
		}
		// fold (fsub x, (fmul y, z)) -> (fma -y, z, x)
		// Only the first multiplicand is negated; the original minuend becomes the
		// addend.
		else if ((isContractableFMul(*RHS, AllowFusionGlobally) &&
		(Aggressive \|\| MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) {
		MatchInfo = [=, &MI](MachineIRBuilder &B) {
		Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg())
		.getReg(0);
		B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
		{NegY, RHS->getOperand(2).getReg(),
		LHS->getOperand(0).getReg()});
		mbrkusaninUnsubmitted Not Done Reply Inline Actions Rename Src1 to NegY so it matches the comment above. Use B not Builder. mbrkusanin: Rename Src1 to NegY so it matches the comment above. Use B not Builder.
		};

		return true;
		}

		return false;
		}

bool CombinerHelper::tryCombine(MachineInstr &MI) {		bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))		if (tryCombineCopy(MI))
return true;		return true;
if (tryCombineExtendingLoads(MI))		if (tryCombineExtendingLoads(MI))
return true;		return true;
if (tryCombineIndexedLoadStore(MI))		if (tryCombineIndexedLoadStore(MI))
return true;		return true;
return false;		return false;
}		}

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s \| FileCheck -check-prefix=GFX9 %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -fp-contract=fast < %s \| FileCheck -check-prefix=GFX9-CONTRACT %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s \| FileCheck -check-prefix=GFX9-DENORM %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s \| FileCheck -check-prefix=GFX10 %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s \| FileCheck -check-prefix=GFX10-CONTRACT %s
				; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s \| FileCheck -check-prefix=GFX10-DENORM %s

				; fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
				; fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)

				define float @test_f32_sub_mul(float %x, float %y, float %z) {
				; GFX9-LABEL: test_f32_sub_mul:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
				; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_f32_sub_mul:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, -v2
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_f32_sub_mul:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_f32_sub_mul:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
				; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_f32_sub_mul:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, -v2
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_f32_sub_mul:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, -v2
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul float %x, %y
				%b = fsub float %a, %z
				ret float %b
				}

				define float @test_f32_sub_mul_rhs(float %x, float %y, float %z) {
				; GFX9-LABEL: test_f32_sub_mul_rhs:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
				; GFX9-NEXT: v_sub_f32_e32 v0, v2, v0
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_f32_sub_mul_rhs:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f32 v0, -v0, v1, v2
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_f32_sub_mul_rhs:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mad_f32 v0, -v0, v1, v2
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_f32_sub_mul_rhs:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
				; GFX10-NEXT: v_sub_f32_e32 v0, v2, v0
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_f32_sub_mul_rhs:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f32 v0, -v0, v1, v2
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_f32_sub_mul_rhs:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mad_f32 v0, -v0, v1, v2
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul float %x, %y
				%b = fsub float %z, %a
				ret float %b
				}

				define half @test_half_sub_mul(half %x, half %y, half %z) {
				; GFX9-LABEL: test_half_sub_mul:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX9-NEXT: v_add_f16_e64 v0, v0, -v2
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_half_sub_mul:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2
				; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				arsenmUnsubmitted Not Done Reply Inline Actions Why did we fail to fold the modifier here? arsenm: Why did we fail to fold the modifier here?
				;
				; GFX9-DENORM-LABEL: test_half_sub_mul:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, -v2
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_half_sub_mul:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX10-NEXT: v_add_f16_e64 v0, v0, -v2
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_half_sub_mul:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2
				; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_half_sub_mul:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX10-DENORM-NEXT: v_add_f16_e64 v0, v0, -v2
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul half %x, %y
				%b = fsub half %a, %z
				ret half %b
				}

				define half @test_half_sub_mul_rhs(half %x, half %y, half %z) {
				; GFX9-LABEL: test_half_sub_mul_rhs:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX9-NEXT: v_add_f16_e64 v0, v2, -v0
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_half_sub_mul_rhs:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_xor_b32_e32 v0, 0x8000, v0
				; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_half_sub_mul_rhs:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX9-DENORM-NEXT: v_add_f16_e64 v0, v2, -v0
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_half_sub_mul_rhs:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX10-NEXT: v_add_f16_e64 v0, v2, -v0
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_half_sub_mul_rhs:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v0, 0x8000, v0
				; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_half_sub_mul_rhs:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
				; GFX10-DENORM-NEXT: v_add_f16_e64 v0, v2, -v0
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul half %x, %y
				%b = fsub half %z, %a
				mbrkusaninUnsubmitted Done Reply Inline Actions %a and %z should be swapped here, otherwise this is the same test as the one above. Also combiner fails for this test for -mcpu=gfx900 --denormal-fp-math=preserve-sign. Same for test above (test_half_sub_mul). It produces correct result only because fsub is replaced by fadd + fneg in legalizer and then is probably matched by one of the other combiners that start from fadd. mbrkusanin: %a and %z should be swapped here, otherwise this is the same test as the one above. Also…
				ret half %b
				}

				define double @test_double_sub_mul(double %x, double %y, double %z) {
				; GFX9-LABEL: test_double_sub_mul:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_double_sub_mul:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_double_sub_mul:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_double_sub_mul:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_double_sub_mul:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5]
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_double_sub_mul:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul double %x, %y
				%b = fsub double %a, %z
				ret double %b
				}

				define double @test_double_sub_mul_rhs(double %x, double %y, double %z) {
				; GFX9-LABEL: test_double_sub_mul_rhs:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX9-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_double_sub_mul_rhs:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_double_sub_mul_rhs:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_double_sub_mul_rhs:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX10-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_double_sub_mul_rhs:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5]
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_double_sub_mul_rhs:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
				; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul double %x, %y
				%b = fsub double %z, %a
				ret double %b
				}

				; <4 x float> case of (fsub (fmul x, y), z): the product is the LEFT operand
				; of the subtraction, i.e. the function returns x * y - z.
				; Expected code, as recorded by the autogenerated checks below:
				;   - unsuffixed prefixes: four v_mul_f32 followed by four v_sub_f32 (no fusion)
				;   - *-CONTRACT prefixes: one v_fma_f32 per lane with the addend (z) negated
				;   - *-DENORM prefixes:   one v_mad_f32 per lane with the addend (z) negated
				; NOTE(review): the run lines that define these prefixes are outside this
				; excerpt; presumably CONTRACT enables FP contraction and DENORM changes the
				; denormal mode - confirm against the file header.
				define <4 x float> @test_v4f32_sub_mul(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
				; GFX9-LABEL: test_v4f32_sub_mul:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
				; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
				; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
				; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
				; GFX9-NEXT: v_sub_f32_e32 v0, v0, v8
				; GFX9-NEXT: v_sub_f32_e32 v1, v1, v9
				; GFX9-NEXT: v_sub_f32_e32 v2, v2, v10
				; GFX9-NEXT: v_sub_f32_e32 v3, v3, v11
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_v4f32_sub_mul:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, -v8
				; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, -v9
				; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, -v10
				; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, -v11
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_v4f32_sub_mul:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v4, -v8
				; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v5, -v9
				; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v6, -v10
				; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, -v11
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_v4f32_sub_mul:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
				; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
				; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
				; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
				; GFX10-NEXT: v_sub_f32_e32 v0, v0, v8
				; GFX10-NEXT: v_sub_f32_e32 v1, v1, v9
				; GFX10-NEXT: v_sub_f32_e32 v2, v2, v10
				; GFX10-NEXT: v_sub_f32_e32 v3, v3, v11
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_v4f32_sub_mul:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, -v8
				; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, -v9
				; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, -v10
				; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, -v11
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_v4f32_sub_mul:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v4, -v8
				; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v5, -v9
				; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, -v10
				; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, -v11
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul <4 x float> %x, %y
				%b = fsub <4 x float> %a, %z
				ret <4 x float> %b
				}

				; <4 x float> case with the product as the RIGHT operand of the subtraction,
				; i.e. the function returns z - x * y.
				; Expected code, as recorded by the autogenerated checks below:
				;   - unsuffixed prefixes: four v_mul_f32 followed by four v_sub_f32 (no fusion)
				;   - *-CONTRACT prefixes: one v_fma_f32 per lane with the first multiplicand negated
				;   - *-DENORM prefixes:   one v_mad_f32 per lane with the first multiplicand negated
				; NOTE(review): the run lines that define these prefixes are outside this
				; excerpt; presumably CONTRACT enables FP contraction and DENORM changes the
				; denormal mode - confirm against the file header.
				define <4 x float> @test_v4f32_sub_mul_rhs(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
				; GFX9-LABEL: test_v4f32_sub_mul_rhs:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
				; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
				; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
				; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
				; GFX9-NEXT: v_sub_f32_e32 v0, v8, v0
				; GFX9-NEXT: v_sub_f32_e32 v1, v9, v1
				; GFX9-NEXT: v_sub_f32_e32 v2, v10, v2
				; GFX9-NEXT: v_sub_f32_e32 v3, v11, v3
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_v4f32_sub_mul_rhs:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f32 v0, -v0, v4, v8
				; GFX9-CONTRACT-NEXT: v_fma_f32 v1, -v1, v5, v9
				; GFX9-CONTRACT-NEXT: v_fma_f32 v2, -v2, v6, v10
				; GFX9-CONTRACT-NEXT: v_fma_f32 v3, -v3, v7, v11
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_v4f32_sub_mul_rhs:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mad_f32 v0, -v0, v4, v8
				; GFX9-DENORM-NEXT: v_mad_f32 v1, -v1, v5, v9
				; GFX9-DENORM-NEXT: v_mad_f32 v2, -v2, v6, v10
				; GFX9-DENORM-NEXT: v_mad_f32 v3, -v3, v7, v11
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_v4f32_sub_mul_rhs:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
				; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
				; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
				; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
				; GFX10-NEXT: v_sub_f32_e32 v0, v8, v0
				; GFX10-NEXT: v_sub_f32_e32 v1, v9, v1
				; GFX10-NEXT: v_sub_f32_e32 v2, v10, v2
				; GFX10-NEXT: v_sub_f32_e32 v3, v11, v3
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_v4f32_sub_mul_rhs:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f32 v0, -v0, v4, v8
				; GFX10-CONTRACT-NEXT: v_fma_f32 v1, -v1, v5, v9
				; GFX10-CONTRACT-NEXT: v_fma_f32 v2, -v2, v6, v10
				; GFX10-CONTRACT-NEXT: v_fma_f32 v3, -v3, v7, v11
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_v4f32_sub_mul_rhs:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mad_f32 v0, -v0, v4, v8
				; GFX10-DENORM-NEXT: v_mad_f32 v1, -v1, v5, v9
				; GFX10-DENORM-NEXT: v_mad_f32 v2, -v2, v6, v10
				; GFX10-DENORM-NEXT: v_mad_f32 v3, -v3, v7, v11
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul <4 x float> %x, %y
				%b = fsub <4 x float> %z, %a
				ret <4 x float> %b
				}

				; <4 x half> case of (fsub (fmul x, y), z): the product is the LEFT operand of
				; the subtraction, i.e. the function returns x * y - z.
				; Expected code, as recorded by the autogenerated checks below:
				;   - unsuffixed and *-DENORM prefixes: two v_pk_mul_f16, then per-half
				;     v_add_f16 with z negated, recombined with v_and_or_b32 (no fusion)
				;   - *-CONTRACT prefixes: two v_pk_fma_f16 with neg_lo/neg_hi set on the
				;     third source operand ([0,0,1])
				; NOTE(review): the run lines that define these prefixes are outside this
				; excerpt; presumably CONTRACT enables FP contraction and DENORM changes the
				; denormal mode - confirm against the file header.
				define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
				; GFX9-LABEL: test_v4f16_sub_mul:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX9-NEXT: v_add_f16_e64 v2, v0, -v4
				; GFX9-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-NEXT: v_add_f16_e64 v3, v1, -v5
				; GFX9-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,0,1] neg_hi:[0,0,1]
				; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,0,1] neg_hi:[0,0,1]
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_v4f16_sub_mul:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX9-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4
				; GFX9-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5
				; GFX9-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX9-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX9-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_v4f16_sub_mul:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4
				; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5
				; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[0,0,1] neg_hi:[0,0,1]
				; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[0,0,1] neg_hi:[0,0,1]
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_v4f16_sub_mul:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4
				; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5
				; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul <4 x half> %x, %y
				%b = fsub <4 x half> %a, %z
				ret <4 x half> %b
				}

				; <4 x half> case with the product as the RIGHT operand of the subtraction,
				; i.e. the function returns z - x * y.
				; Expected code, as recorded by the autogenerated checks below:
				;   - unsuffixed and *-DENORM prefixes: two v_pk_mul_f16, then per-half
				;     v_add_f16 of z and the negated product, recombined with v_and_or_b32
				;     (no fusion)
				;   - *-CONTRACT prefixes: two v_pk_fma_f16 with neg_lo/neg_hi set on the
				;     first source operand ([1,0,0])
				; NOTE(review): the run lines that define these prefixes are outside this
				; excerpt; presumably CONTRACT enables FP contraction and DENORM changes the
				; denormal mode - confirm against the file header.
				define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x half> %z) {
				; GFX9-LABEL: test_v4f16_sub_mul_rhs:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX9-NEXT: v_add_f16_e64 v2, v4, -v0
				; GFX9-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-NEXT: v_add_f16_e64 v3, v5, -v1
				; GFX9-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX9-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX9-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[1,0,0] neg_hi:[1,0,0]
				; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[1,0,0] neg_hi:[1,0,0]
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX9-DENORM-NEXT: v_add_f16_e64 v2, v4, -v0
				; GFX9-DENORM-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-DENORM-NEXT: v_add_f16_e64 v3, v5, -v1
				; GFX9-DENORM-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX9-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX9-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX9-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_v4f16_sub_mul_rhs:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX10-NEXT: v_add_f16_e64 v2, v4, -v0
				; GFX10-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-NEXT: v_add_f16_e64 v3, v5, -v1
				; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX10-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 neg_lo:[1,0,0] neg_hi:[1,0,0]
				; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 neg_lo:[1,0,0] neg_hi:[1,0,0]
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
				; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
				; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v4, -v0
				; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v5, -v1
				; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
				; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
				; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
				; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul <4 x half> %x, %y
				%b = fsub <4 x half> %z, %a
				ret <4 x half> %b
				}

				; <4 x double> case of (fsub (fmul x, y), z): the product is the LEFT operand
				; of the subtraction, i.e. the function returns x * y - z.
				; Expected code, as recorded by the autogenerated checks below:
				;   - unsuffixed and *-DENORM prefixes: four v_mul_f64 followed by four
				;     v_add_f64 with z negated (no fusion)
				;   - *-CONTRACT prefixes: one v_fma_f64 per lane with the addend (z) negated
				; NOTE(review): the run lines that define these prefixes are outside this
				; excerpt; presumably CONTRACT enables FP contraction and DENORM changes the
				; denormal mode - confirm against the file header.
				define <4 x double> @test_v4f64_sub_mul(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
				; GFX9-LABEL: test_v4f64_sub_mul:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX9-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], -v[16:17]
				; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], -v[18:19]
				; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], -v[20:21]
				; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], -v[22:23]
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_v4f64_sub_mul:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], -v[16:17]
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], -v[18:19]
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], -v[20:21]
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], -v[22:23]
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_v4f64_sub_mul:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX9-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX9-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX9-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[16:17]
				; GFX9-DENORM-NEXT: v_add_f64 v[2:3], v[2:3], -v[18:19]
				; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], -v[20:21]
				; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], -v[22:23]
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_v4f64_sub_mul:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX10-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX10-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], -v[16:17]
				; GFX10-NEXT: v_add_f64 v[2:3], v[2:3], -v[18:19]
				; GFX10-NEXT: v_add_f64 v[4:5], v[4:5], -v[20:21]
				; GFX10-NEXT: v_add_f64 v[6:7], v[6:7], -v[22:23]
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_v4f64_sub_mul:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], -v[16:17]
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], -v[18:19]
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], -v[20:21]
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], -v[22:23]
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_v4f64_sub_mul:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX10-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX10-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX10-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], -v[16:17]
				; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[2:3], -v[18:19]
				; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], -v[20:21]
				; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], -v[22:23]
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul <4 x double> %x, %y
				%b = fsub <4 x double> %a, %z
				; Archived inline review remark from mbrkusanin (marked Done): "Same here,
				; swap %a and %z." The operand order of the fsub above (%a, %z) already
				; matches the expected output recorded in the checks (product minus z).
				ret <4 x double> %b
				}

				; <4 x double> case with the product as the RIGHT operand of the subtraction,
				; i.e. the function returns z - x * y.
				; Expected code, as recorded by the autogenerated checks below:
				;   - unsuffixed and *-DENORM prefixes: four v_mul_f64 followed by four
				;     v_add_f64 of z and the negated product (no fusion)
				;   - *-CONTRACT prefixes: one v_fma_f64 per lane with the first multiplicand
				;     negated
				; NOTE(review): the run lines that define these prefixes are outside this
				; excerpt; presumably CONTRACT enables FP contraction and DENORM changes the
				; denormal mode - confirm against the file header.
				define <4 x double> @test_v4f64_sub_mul_rhs(<4 x double> %x, <4 x double> %y, <4 x double> %z) {
				; GFX9-LABEL: test_v4f64_sub_mul_rhs:
				; GFX9: ; %bb.0: ; %.entry
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX9-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX9-NEXT: v_add_f64 v[0:1], v[16:17], -v[0:1]
				; GFX9-NEXT: v_add_f64 v[2:3], v[18:19], -v[2:3]
				; GFX9-NEXT: v_add_f64 v[4:5], v[20:21], -v[4:5]
				; GFX9-NEXT: v_add_f64 v[6:7], v[22:23], -v[6:7]
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-CONTRACT-LABEL: test_v4f64_sub_mul_rhs:
				; GFX9-CONTRACT: ; %bb.0: ; %.entry
				; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], v[16:17]
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], v[18:19]
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], -v[4:5], v[12:13], v[20:21]
				; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[22:23]
				; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-DENORM-LABEL: test_v4f64_sub_mul_rhs:
				; GFX9-DENORM: ; %bb.0: ; %.entry
				; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX9-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX9-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX9-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[16:17], -v[0:1]
				; GFX9-DENORM-NEXT: v_add_f64 v[2:3], v[18:19], -v[2:3]
				; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[20:21], -v[4:5]
				; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[22:23], -v[6:7]
				; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-LABEL: test_v4f64_sub_mul_rhs:
				; GFX10: ; %bb.0: ; %.entry
				; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX10-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX10-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX10-NEXT: v_add_f64 v[0:1], v[16:17], -v[0:1]
				; GFX10-NEXT: v_add_f64 v[2:3], v[18:19], -v[2:3]
				; GFX10-NEXT: v_add_f64 v[4:5], v[20:21], -v[4:5]
				; GFX10-NEXT: v_add_f64 v[6:7], v[22:23], -v[6:7]
				; GFX10-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-CONTRACT-LABEL: test_v4f64_sub_mul_rhs:
				; GFX10-CONTRACT: ; %bb.0: ; %.entry
				; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], -v[0:1], v[8:9], v[16:17]
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], -v[2:3], v[10:11], v[18:19]
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], -v[4:5], v[12:13], v[20:21]
				; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[22:23]
				; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX10-DENORM-LABEL: test_v4f64_sub_mul_rhs:
				; GFX10-DENORM: ; %bb.0: ; %.entry
				; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
				; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9]
				; GFX10-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11]
				; GFX10-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13]
				; GFX10-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15]
				; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[16:17], -v[0:1]
				; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[18:19], -v[2:3]
				; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[20:21], -v[4:5]
				; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[22:23], -v[6:7]
				; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
				.entry:
				%a = fmul <4 x double> %x, %y
				%b = fsub <4 x double> %z, %a
				ret <4 x double> %b
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Transform (fsub (fmul x, y), z) -> (fma x, y, -z)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 384763

llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

llvm/include/llvm/Target/GlobalISel/Combine.td

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU][GlobalISel] Transform (fsub (fmul x, y), z) -> (fma x, y, -z)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 384763

llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h

llvm/include/llvm/Target/GlobalISel/Combine.td

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll

[AMDGPU][GlobalISel] Transform (fsub (fmul x, y), z) -> (fma x, y, -z)
ClosedPublic