This is an archive of the discontinued LLVM Phabricator instance.

[aarch64] Add combine patterns for fp16 fmla
ClosedPublic

Authored by sebpop on Sep 6 2019, 12:03 PM.

Download Raw Diff

Details

Reviewers

SjoerdMeijer
az
evandro
jgreenhalgh
kristof.beyls

Commits

rGeacb2c2c975c: [aarch64] Add combine patterns for fp16 fmla
rL371321: [aarch64] Add combine patterns for fp16 fmla

Summary

This patch enables generation of fused multiply add/sub for instructions operating on fp16.
Tested on aarch64-linux.

There are 7 CHECK-FIXME lines for patterns for which I was not able to create a test case that exercises the added code paths.
Those 7 patterns mix v[4|8]i16 with v[4|8]fp16 types with the help of a bitcast.
I am not sure how to write a test case without the bitcast that still provides coverage of those combine patterns,
so I would appreciate help with rewriting those test cases.

Diff Detail

Repository: rL LLVM

Event Timeline

sebpop created this revision. Sep 6 2019, 12:03 PM

Herald added a project: Restricted Project. · View Herald Transcript · Sep 6 2019, 12:03 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

Hi Sebastian, thanks for fixing this.
This looks reasonable to me as an initial commit. This combiner part is a real copy-paste mess, but there's enough prior art here that this should be okay for now. I think we should follow up to clean this up, though; in the meantime, it's not bad to have a reference.
A small nit: instead of the CHECK-FIXME lines, perhaps it's better to just match the current output for now and add a FIXME comment, so that it is obvious when codegen changes.
And lastly, related to this but something that can be addressed separately: I noticed the LLVM fma intrinsics when I looked into this. I haven't looked into the details yet, but we probably need to support the f16 variant for completeness.

This revision is now accepted and ready to land. Sep 6 2019, 1:44 PM

Closed by commit rL371321: [aarch64] Add combine patterns for fp16 fmla (authored by spop). · Explain Why · Sep 7 2019, 1:24 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

CodeGen/

MachineCombinerPattern.h

19 lines

lib/

Target/

AArch64/

AArch64InstrInfo.cpp

342 lines

test/

CodeGen/

AArch64/

fp16-fmla.ll

208 lines

Diff 219250

llvm/trunk/include/llvm/CodeGen/MachineCombinerPattern.h

Show All 33 Lines	enum class MachineCombinerPattern {
MULSUBWI_OP1,		MULSUBWI_OP1,
MULADDX_OP1,		MULADDX_OP1,
MULADDX_OP2,		MULADDX_OP2,
MULSUBX_OP1,		MULSUBX_OP1,
MULSUBX_OP2,		MULSUBX_OP2,
MULADDXI_OP1,		MULADDXI_OP1,
MULSUBXI_OP1,		MULSUBXI_OP1,
// Floating Point		// Floating Point
		FMULADDH_OP1,
		FMULADDH_OP2,
		FMULSUBH_OP1,
		FMULSUBH_OP2,
FMULADDS_OP1,		FMULADDS_OP1,
FMULADDS_OP2,		FMULADDS_OP2,
FMULSUBS_OP1,		FMULSUBS_OP1,
FMULSUBS_OP2,		FMULSUBS_OP2,
FMULADDD_OP1,		FMULADDD_OP1,
FMULADDD_OP2,		FMULADDD_OP2,
FMULSUBD_OP1,		FMULSUBD_OP1,
FMULSUBD_OP2,		FMULSUBD_OP2,
		FNMULSUBH_OP1,
FNMULSUBS_OP1,		FNMULSUBS_OP1,
FNMULSUBD_OP1,		FNMULSUBD_OP1,
FMLAv1i32_indexed_OP1,		FMLAv1i32_indexed_OP1,
FMLAv1i32_indexed_OP2,		FMLAv1i32_indexed_OP2,
FMLAv1i64_indexed_OP1,		FMLAv1i64_indexed_OP1,
FMLAv1i64_indexed_OP2,		FMLAv1i64_indexed_OP2,
		FMLAv4f16_OP1,
		FMLAv4f16_OP2,
		FMLAv8f16_OP1,
		FMLAv8f16_OP2,
FMLAv2f32_OP2,		FMLAv2f32_OP2,
FMLAv2f32_OP1,		FMLAv2f32_OP1,
FMLAv2f64_OP1,		FMLAv2f64_OP1,
FMLAv2f64_OP2,		FMLAv2f64_OP2,
		FMLAv4i16_indexed_OP1,
		FMLAv4i16_indexed_OP2,
		FMLAv8i16_indexed_OP1,
		FMLAv8i16_indexed_OP2,
FMLAv2i32_indexed_OP1,		FMLAv2i32_indexed_OP1,
FMLAv2i32_indexed_OP2,		FMLAv2i32_indexed_OP2,
FMLAv2i64_indexed_OP1,		FMLAv2i64_indexed_OP1,
FMLAv2i64_indexed_OP2,		FMLAv2i64_indexed_OP2,
FMLAv4f32_OP1,		FMLAv4f32_OP1,
FMLAv4f32_OP2,		FMLAv4f32_OP2,
FMLAv4i32_indexed_OP1,		FMLAv4i32_indexed_OP1,
FMLAv4i32_indexed_OP2,		FMLAv4i32_indexed_OP2,
FMLSv1i32_indexed_OP2,		FMLSv1i32_indexed_OP2,
FMLSv1i64_indexed_OP2,		FMLSv1i64_indexed_OP2,
		FMLSv4f16_OP2,
		FMLSv8f16_OP1,
		FMLSv8f16_OP2,
FMLSv2f32_OP1,		FMLSv2f32_OP1,
FMLSv2f32_OP2,		FMLSv2f32_OP2,
FMLSv2f64_OP1,		FMLSv2f64_OP1,
FMLSv2f64_OP2,		FMLSv2f64_OP2,
		FMLSv4i16_indexed_OP2,
		FMLSv8i16_indexed_OP1,
		FMLSv8i16_indexed_OP2,
FMLSv2i32_indexed_OP1,		FMLSv2i32_indexed_OP1,
FMLSv2i32_indexed_OP2,		FMLSv2i32_indexed_OP2,
FMLSv2i64_indexed_OP1,		FMLSv2i64_indexed_OP1,
FMLSv2i64_indexed_OP2,		FMLSv2i64_indexed_OP2,
FMLSv4f32_OP1,		FMLSv4f32_OP1,
FMLSv4f32_OP2,		FMLSv4f32_OP2,
FMLSv4i32_indexed_OP1,		FMLSv4i32_indexed_OP1,
FMLSv4i32_indexed_OP2		FMLSv4i32_indexed_OP2
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp

Show First 20 Lines • Show All 3,460 Lines • ▼ Show 20 Lines	static bool isCombineInstrCandidate64(unsigned Opc) {
return false;		return false;
}		}

// FP Opcodes that can be combined with a FMUL		// FP Opcodes that can be combined with a FMUL
static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {		static bool isCombineInstrCandidateFP(const MachineInstr &Inst) {
switch (Inst.getOpcode()) {		switch (Inst.getOpcode()) {
default:		default:
break;		break;
		case AArch64::FADDHrr:
case AArch64::FADDSrr:		case AArch64::FADDSrr:
case AArch64::FADDDrr:		case AArch64::FADDDrr:
		case AArch64::FADDv4f16:
		case AArch64::FADDv8f16:
case AArch64::FADDv2f32:		case AArch64::FADDv2f32:
case AArch64::FADDv2f64:		case AArch64::FADDv2f64:
case AArch64::FADDv4f32:		case AArch64::FADDv4f32:
		case AArch64::FSUBHrr:
case AArch64::FSUBSrr:		case AArch64::FSUBSrr:
case AArch64::FSUBDrr:		case AArch64::FSUBDrr:
		case AArch64::FSUBv4f16:
		case AArch64::FSUBv8f16:
case AArch64::FSUBv2f32:		case AArch64::FSUBv2f32:
case AArch64::FSUBv2f64:		case AArch64::FSUBv2f64:
case AArch64::FSUBv4f32:		case AArch64::FSUBv4f32:
TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;		TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options;
return (Options.UnsafeFPMath ||		return (Options.UnsafeFPMath ||
Options.AllowFPOpFusion == FPOpFusion::Fast);		Options.AllowFPOpFusion == FPOpFusion::Fast);
}		}
return false;		return false;
▲ Show 20 Lines • Show All 193 Lines • ▼ Show 20 Lines	static bool getFMAPatterns(MachineInstr &Root,

MachineBasicBlock &MBB = *Root.getParent();		MachineBasicBlock &MBB = *Root.getParent();
bool Found = false;		bool Found = false;

switch (Root.getOpcode()) {		switch (Root.getOpcode()) {
default:		default:
assert(false && "Unsupported FP instruction in combiner\n");		assert(false && "Unsupported FP instruction in combiner\n");
break;		break;
		case AArch64::FADDHrr:
		assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
		"FADDHrr does not have register operands");
		if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULHrr)) {
		Patterns.push_back(MachineCombinerPattern::FMULADDH_OP1);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULHrr)) {
		Patterns.push_back(MachineCombinerPattern::FMULADDH_OP2);
		Found = true;
		}
		break;
case AArch64::FADDSrr:		case AArch64::FADDSrr:
assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&		assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
"FADDWrr does not have register operands");		"FADDSrr does not have register operands");
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {		if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);		Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1);
Found = true;		Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),		} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv1i32_indexed)) {		AArch64::FMULv1i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);		Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1);
Found = true;		Found = true;
}		}
Show All 19 Lines	if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);		Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2);
Found = true;		Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv1i64_indexed)) {		AArch64::FMULv1i64_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);		Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2);
Found = true;		Found = true;
}		}
break;		break;
		case AArch64::FADDv4f16:
		if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv4i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv4i16_indexed_OP1);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv4f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv4f16_OP1);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv4i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv4i16_indexed_OP2);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv4f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv4f16_OP2);
		Found = true;
		}
		break;
		case AArch64::FADDv8f16:
		if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv8i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv8i16_indexed_OP1);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv8f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv8f16_OP1);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv8i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv8i16_indexed_OP2);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv8f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLAv8f16_OP2);
		Found = true;
		}
		break;
case AArch64::FADDv2f32:		case AArch64::FADDv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(1),		if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2i32_indexed)) {		AArch64::FMULv2i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);		Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1);
Found = true;		Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(1),		} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
AArch64::FMULv2f32)) {		AArch64::FMULv2f32)) {
Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);		Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1);
▲ Show 20 Lines • Show All 45 Lines • ▼ Show 20 Lines	if (canCombineWithFMUL(MBB, Root.getOperand(2),
Found = true;		Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv4f32)) {		AArch64::FMULv4f32)) {
Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);		Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2);
Found = true;		Found = true;
}		}
break;		break;

		case AArch64::FSUBHrr:
		if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULHrr)) {
		Patterns.push_back(MachineCombinerPattern::FMULSUBH_OP1);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULHrr)) {
		Patterns.push_back(MachineCombinerPattern::FMULSUBH_OP2);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULHrr)) {
		Patterns.push_back(MachineCombinerPattern::FNMULSUBH_OP1);
		Found = true;
		}
		break;
case AArch64::FSUBSrr:		case AArch64::FSUBSrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {		if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);		Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1);
Found = true;		Found = true;
}		}
if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {		if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) {
Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);		Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2);
Found = true;		Found = true;
Show All 20 Lines	if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);		Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;		Found = true;
}		}
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {		if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);		Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
Found = true;		Found = true;
}		}
break;		break;
		case AArch64::FSUBv4f16:
		if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv4i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv4i16_indexed_OP2);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv4f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv4f16_OP2);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv4i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP1);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv4f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP1);
		Found = true;
		}
		break;
		case AArch64::FSUBv8f16:
		if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv8i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv8i16_indexed_OP2);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
		AArch64::FMULv8f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv8f16_OP2);
		Found = true;
		}
		if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv8i16_indexed)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv8i16_indexed_OP1);
		Found = true;
		} else if (canCombineWithFMUL(MBB, Root.getOperand(1),
		AArch64::FMULv8f16)) {
		Patterns.push_back(MachineCombinerPattern::FMLSv8f16_OP1);
		Found = true;
		}
		break;
case AArch64::FSUBv2f32:		case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),		if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2i32_indexed)) {		AArch64::FMULv2i32_indexed)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);		Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2);
Found = true;		Found = true;
} else if (canCombineWithFMUL(MBB, Root.getOperand(2),		} else if (canCombineWithFMUL(MBB, Root.getOperand(2),
AArch64::FMULv2f32)) {		AArch64::FMULv2f32)) {
Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);		Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2);
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines
/// Return true when a code sequence can improve throughput. It		/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.		/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern		/// \param Pattern - combiner pattern
bool AArch64InstrInfo::isThroughputPattern(		bool AArch64InstrInfo::isThroughputPattern(
MachineCombinerPattern Pattern) const {		MachineCombinerPattern Pattern) const {
switch (Pattern) {		switch (Pattern) {
default:		default:
break;		break;
		case MachineCombinerPattern::FMULADDH_OP1:
		case MachineCombinerPattern::FMULADDH_OP2:
		case MachineCombinerPattern::FMULSUBH_OP1:
		case MachineCombinerPattern::FMULSUBH_OP2:
case MachineCombinerPattern::FMULADDS_OP1:		case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDS_OP2:		case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULSUBS_OP1:		case MachineCombinerPattern::FMULSUBS_OP1:
case MachineCombinerPattern::FMULSUBS_OP2:		case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULADDD_OP1:		case MachineCombinerPattern::FMULADDD_OP1:
case MachineCombinerPattern::FMULADDD_OP2:		case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:		case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:		case MachineCombinerPattern::FMULSUBD_OP2:
		case MachineCombinerPattern::FNMULSUBH_OP1:
case MachineCombinerPattern::FNMULSUBS_OP1:		case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1:		case MachineCombinerPattern::FNMULSUBD_OP1:
		case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
		case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
		case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
		case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:		case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:		case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:		case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:		case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
		case MachineCombinerPattern::FMLAv4f16_OP2:
		case MachineCombinerPattern::FMLAv4f16_OP1:
		case MachineCombinerPattern::FMLAv8f16_OP1:
		case MachineCombinerPattern::FMLAv8f16_OP2:
case MachineCombinerPattern::FMLAv2f32_OP2:		case MachineCombinerPattern::FMLAv2f32_OP2:
case MachineCombinerPattern::FMLAv2f32_OP1:		case MachineCombinerPattern::FMLAv2f32_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:		case MachineCombinerPattern::FMLAv2f64_OP1:
case MachineCombinerPattern::FMLAv2f64_OP2:		case MachineCombinerPattern::FMLAv2f64_OP2:
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:		case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:		case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:		case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:		case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP1:		case MachineCombinerPattern::FMLAv4f32_OP1:
case MachineCombinerPattern::FMLAv4f32_OP2:		case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:		case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:		case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
		case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
		case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
		case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:		case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:		case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:		case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:		case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
		case MachineCombinerPattern::FMLSv4f16_OP2:
		case MachineCombinerPattern::FMLSv8f16_OP1:
		case MachineCombinerPattern::FMLSv8f16_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:		case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:		case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:		case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:		case MachineCombinerPattern::FMLSv4f32_OP2:
return true;		return true;
} // end switch (Pattern)		} // end switch (Pattern)
return false;		return false;
}		}
▲ Show 20 Lines • Show All 331 Lines • ▼ Show 20 Lines	if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
.addImm(Encoding);		.addImm(Encoding);
InsInstrs.push_back(MIB1);		InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));		InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);		MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
}		}
break;		break;
}		}
// Floating Point Support		// Floating Point Support
		case MachineCombinerPattern::FMULADDH_OP1:
		Opc = AArch64::FMADDHrrr;
		RC = &AArch64::FPR16RegClass;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
case MachineCombinerPattern::FMULADDS_OP1:		case MachineCombinerPattern::FMULADDS_OP1:
case MachineCombinerPattern::FMULADDD_OP1:
// MUL I=A,B,0
// ADD R,I,C
// ==> MADD R,A,B,C
// --- Create(MADD);
if (Pattern == MachineCombinerPattern::FMULADDS_OP1) {
Opc = AArch64::FMADDSrrr;		Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
} else {		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::FMULADDD_OP1:
Opc = AArch64::FMADDDrrr;		Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;		break;

		case MachineCombinerPattern::FMULADDH_OP2:
		Opc = AArch64::FMADDHrrr;
		RC = &AArch64::FPR16RegClass;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
case MachineCombinerPattern::FMULADDS_OP2:		case MachineCombinerPattern::FMULADDS_OP2:
case MachineCombinerPattern::FMULADDD_OP2:
// FMUL I=A,B,0
// FADD R,C,I
// ==> FMADD R,A,B,C
// --- Create(FMADD);
if (Pattern == MachineCombinerPattern::FMULADDS_OP2) {
Opc = AArch64::FMADDSrrr;		Opc = AArch64::FMADDSrrr;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
} else {		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::FMULADDD_OP2:
Opc = AArch64::FMADDDrrr;		Opc = AArch64::FMADDDrrr;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;		break;

case MachineCombinerPattern::FMLAv1i32_indexed_OP1:		case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
Opc = AArch64::FMLAv1i32_indexed;		Opc = AArch64::FMLAv1i32_indexed;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
Show All 13 Lines	case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
break;		break;
case MachineCombinerPattern::FMLAv1i64_indexed_OP2:		case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
Opc = AArch64::FMLAv1i64_indexed;		Opc = AArch64::FMLAv1i64_indexed;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
break;		break;

		case MachineCombinerPattern::FMLAv4i16_indexed_OP1:
		RC = &AArch64::FPR64RegClass;
		Opc = AArch64::FMLAv4i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Indexed);
		break;
		case MachineCombinerPattern::FMLAv4f16_OP1:
		RC = &AArch64::FPR64RegClass;
		Opc = AArch64::FMLAv4f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Accumulator);
		break;
		case MachineCombinerPattern::FMLAv4i16_indexed_OP2:
		RC = &AArch64::FPR64RegClass;
		Opc = AArch64::FMLAv4i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Indexed);
		break;
		case MachineCombinerPattern::FMLAv4f16_OP2:
		RC = &AArch64::FPR64RegClass;
		Opc = AArch64::FMLAv4f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Accumulator);
		break;

case MachineCombinerPattern::FMLAv2i32_indexed_OP1:		case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2f32_OP1:		case MachineCombinerPattern::FMLAv2f32_OP1:
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {		if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) {
Opc = AArch64::FMLAv2i32_indexed;		Opc = AArch64::FMLAv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
} else {		} else {
Show All 11 Lines	if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) {
FMAInstKind::Indexed);		FMAInstKind::Indexed);
} else {		} else {
Opc = AArch64::FMLAv2f32;		Opc = AArch64::FMLAv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator);
}		}
break;		break;

		case MachineCombinerPattern::FMLAv8i16_indexed_OP1:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLAv8i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Indexed);
		break;
		case MachineCombinerPattern::FMLAv8f16_OP1:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLAv8f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Accumulator);
		break;
		case MachineCombinerPattern::FMLAv8i16_indexed_OP2:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLAv8i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Indexed);
		break;
		case MachineCombinerPattern::FMLAv8f16_OP2:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLAv8f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Accumulator);
		break;

case MachineCombinerPattern::FMLAv2i64_indexed_OP1:		case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2f64_OP1:		case MachineCombinerPattern::FMLAv2f64_OP1:
RC = &AArch64::FPR128RegClass;		RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {		if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) {
Opc = AArch64::FMLAv2i64_indexed;		Opc = AArch64::FMLAv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
} else {		} else {
Show All 39 Lines	if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) {
FMAInstKind::Indexed);		FMAInstKind::Indexed);
} else {		} else {
Opc = AArch64::FMLAv4f32;		Opc = AArch64::FMLAv4f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator);
}		}
break;		break;

		case MachineCombinerPattern::FMULSUBH_OP1:
		Opc = AArch64::FNMSUBHrrr;
		RC = &AArch64::FPR16RegClass;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
case MachineCombinerPattern::FMULSUBS_OP1:		case MachineCombinerPattern::FMULSUBS_OP1:
case MachineCombinerPattern::FMULSUBD_OP1: {
// FMUL I=A,B,0
// FSUB R,I,C
// ==> FNMSUB R,A,B,C // = -C + A*B
// --- Create(FNMSUB);
if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) {
Opc = AArch64::FNMSUBSrrr;		Opc = AArch64::FNMSUBSrrr;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
} else {		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::FMULSUBD_OP1:
Opc = AArch64::FNMSUBDrrr;		Opc = AArch64::FNMSUBDrrr;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;		break;
}

		case MachineCombinerPattern::FNMULSUBH_OP1:
		Opc = AArch64::FNMADDHrrr;
		RC = &AArch64::FPR16RegClass;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
case MachineCombinerPattern::FNMULSUBS_OP1:		case MachineCombinerPattern::FNMULSUBS_OP1:
case MachineCombinerPattern::FNMULSUBD_OP1: {
// FNMUL I=A,B,0
// FSUB R,I,C
// ==> FNMADD R,A,B,C // = -A*B - C
// --- Create(FNMADD);
if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
Opc = AArch64::FNMADDSrrr;		Opc = AArch64::FNMADDSrrr;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
} else {		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
		break;
		case MachineCombinerPattern::FNMULSUBD_OP1:
Opc = AArch64::FNMADDDrrr;		Opc = AArch64::FNMADDDrrr;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;		break;
}

		case MachineCombinerPattern::FMULSUBH_OP2:
		Opc = AArch64::FMSUBHrrr;
		RC = &AArch64::FPR16RegClass;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
case MachineCombinerPattern::FMULSUBS_OP2:		case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
// FSUB R,C,I
// ==> FMSUB R,A,B,C (computes C - A*B)
// --- Create(FMSUB);
if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) {
Opc = AArch64::FMSUBSrrr;		Opc = AArch64::FMSUBSrrr;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
} else {		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
		break;
		case MachineCombinerPattern::FMULSUBD_OP2:
Opc = AArch64::FMSUBDrrr;		Opc = AArch64::FMSUBDrrr;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;		break;
}

case MachineCombinerPattern::FMLSv1i32_indexed_OP2:		case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;		Opc = AArch64::FMLSv1i32_indexed;
RC = &AArch64::FPR32RegClass;		RC = &AArch64::FPR32RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
break;		break;

case MachineCombinerPattern::FMLSv1i64_indexed_OP2:		case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
Opc = AArch64::FMLSv1i64_indexed;		Opc = AArch64::FMLSv1i64_indexed;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
break;		break;

		case MachineCombinerPattern::FMLSv4f16_OP2:
		RC = &AArch64::FPR64RegClass;
		Opc = AArch64::FMLSv4f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Accumulator);
		break;
		case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
		RC = &AArch64::FPR64RegClass;
		Opc = AArch64::FMLSv4i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Indexed);
		break;

case MachineCombinerPattern::FMLSv2f32_OP2:		case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:		case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {		if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) {
Opc = AArch64::FMLSv2i32_indexed;		Opc = AArch64::FMLSv2i32_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
} else {		} else {
Opc = AArch64::FMLSv2f32;		Opc = AArch64::FMLSv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator);
}		}
break;		break;

		case MachineCombinerPattern::FMLSv8f16_OP1:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLSv8f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Accumulator);
		break;
		case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLSv8i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Indexed);
		break;

		case MachineCombinerPattern::FMLSv8f16_OP2:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLSv8f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Accumulator);
		break;
		case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
		RC = &AArch64::FPR128RegClass;
		Opc = AArch64::FMLSv8i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
		FMAInstKind::Indexed);
		break;

case MachineCombinerPattern::FMLSv2f64_OP2:		case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:		case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
RC = &AArch64::FPR128RegClass;		RC = &AArch64::FPR128RegClass;
if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {		if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) {
Opc = AArch64::FMLSv2i64_indexed;		Opc = AArch64::FMLSv2i64_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
} else {		} else {
▲ Show 20 Lines • Show All 1,109 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AArch64/fp16-fmla.ll

				; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+v8.2a,+fullfp16 -fp-contract=fast | FileCheck %s

				; Scalar half: the fmul result is the FIRST operand of the fadd.
				; The CHECK line expects a single fused fmadd on h registers.
				define half @test_FMULADDH_OP1(half %a, half %b, half %c) {
				; CHECK-LABEL: test_FMULADDH_OP1:
				; CHECK: fmadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
				entry:
				%mul = fmul fast half %c, %b
				%add = fadd fast half %mul, %a
				ret half %add
				}

				; Scalar half: the fmul result is the SECOND operand of the fadd.
				; The CHECK line expects a single fused fmadd on h registers.
				define half @test_FMULADDH_OP2(half %a, half %b, half %c) {
				; CHECK-LABEL: test_FMULADDH_OP2:
				; CHECK: fmadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
				entry:
				%mul = fmul fast half %c, %b
				%add = fadd fast half %a, %mul
				ret half %add
				}

				; Scalar half: the fmul result is the minuend (first operand) of the fsub,
				; i.e. the function computes c*b - a. The CHECK line expects fnmsub.
				define half @test_FMULSUBH_OP1(half %a, half %b, half %c) {
				; CHECK-LABEL: test_FMULSUBH_OP1:
				; CHECK: fnmsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
				entry:
				%mul = fmul fast half %c, %b
				%sub = fsub fast half %mul, %a
				ret half %sub
				}

				; Scalar half: the fmul result is the subtrahend (second operand) of the
				; fsub, i.e. the function computes a - c*b. The CHECK line expects fmsub.
				define half @test_FMULSUBH_OP2(half %a, half %b, half %c) {
				; CHECK-LABEL: test_FMULSUBH_OP2:
				; CHECK: fmsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
				entry:
				%mul = fmul fast half %c, %b
				%sub = fsub fast half %a, %mul
				ret half %sub
				}

				; Scalar half: the product is negated (-0.0 - mul) and %a is then subtracted
				; from it, i.e. the function computes -(c*b) - a. The CHECK line expects
				; fnmadd.
				define half @test_FNMULSUBH_OP1(half %a, half %b, half %c) {
				; CHECK-LABEL: test_FNMULSUBH_OP1:
				; CHECK: fnmadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
				entry:
				%mul = fmul fast half %c, %b
				%neg = fsub fast half -0.0, %mul
				%sub = fsub fast half %neg, %a
				ret half %sub
				}

				; <4 x half> vector: the fmul result is the FIRST operand of the fadd.
				; The CHECK line expects a fused fmla on .4h vector registers.
				define <4 x half> @test_FMLAv4f16_OP1(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
				; CHECK-LABEL: test_FMLAv4f16_OP1:
				; CHECK: fmla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = fmul fast <4 x half> %c, %b
				%add = fadd fast <4 x half> %mul, %a
				ret <4 x half> %add
				}

				; <4 x half> vector: the fmul result is the SECOND operand of the fadd.
				; The CHECK line expects a fused fmla on .4h vector registers.
				define <4 x half> @test_FMLAv4f16_OP2(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
				; CHECK-LABEL: test_FMLAv4f16_OP2:
				; CHECK: fmla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = fmul fast <4 x half> %c, %b
				%add = fadd fast <4 x half> %a, %mul
				ret <4 x half> %add
				}

				; <8 x half> vector: the fmul result is the FIRST operand of the fadd.
				; The CHECK line expects a fused fmla on .8h vector registers.
				define <8 x half> @test_FMLAv8f16_OP1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
				; CHECK-LABEL: test_FMLAv8f16_OP1:
				; CHECK: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = fmul fast <8 x half> %c, %b
				%add = fadd fast <8 x half> %mul, %a
				ret <8 x half> %add
				}

				; <8 x half> vector: the fmul result is the SECOND operand of the fadd.
				; The CHECK line expects a fused fmla on .8h vector registers.
				define <8 x half> @test_FMLAv8f16_OP2(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
				; CHECK-LABEL: test_FMLAv8f16_OP2:
				; CHECK: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = fmul fast <8 x half> %c, %b
				%add = fadd fast <8 x half> %a, %mul
				ret <8 x half> %add
				}

				; Pins the CURRENT (unfused) codegen for a <4 x i16> product that is bitcast
				; to <4 x half> and used as the first operand of an fadd.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <4 x half> @test_FMLAv4i16_indexed_OP1(<4 x half> %a, <4 x i16> %b, <4 x i16> %c) {
				; CHECK-LABEL: test_FMLAv4i16_indexed_OP1:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fadd
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = mul <4 x i16> %c, %b
				%m = bitcast <4 x i16> %mul to <4 x half>
				%add = fadd fast <4 x half> %m, %a
				ret <4 x half> %add
				}

				; Pins the CURRENT (unfused) codegen for a <4 x i16> product that is bitcast
				; to <4 x half> and used as the second operand of an fadd.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <4 x half> @test_FMLAv4i16_indexed_OP2(<4 x half> %a, <4 x i16> %b, <4 x i16> %c) {
				; CHECK-LABEL: test_FMLAv4i16_indexed_OP2:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fadd
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = mul <4 x i16> %c, %b
				%m = bitcast <4 x i16> %mul to <4 x half>
				%add = fadd fast <4 x half> %a, %m
				ret <4 x half> %add
				}

				; Pins the CURRENT (unfused) codegen for an <8 x i16> product that is bitcast
				; to <8 x half> and used as the first operand of an fadd.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <8 x half> @test_FMLAv8i16_indexed_OP1(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {
				; CHECK-LABEL: test_FMLAv8i16_indexed_OP1:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fadd
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = mul <8 x i16> %c, %b
				%m = bitcast <8 x i16> %mul to <8 x half>
				%add = fadd fast <8 x half> %m, %a
				ret <8 x half> %add
				}

				; Pins the CURRENT (unfused) codegen for an <8 x i16> product that is bitcast
				; to <8 x half> and used as the second operand of an fadd.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <8 x half> @test_FMLAv8i16_indexed_OP2(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {
				; CHECK-LABEL: test_FMLAv8i16_indexed_OP2:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fadd
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = mul <8 x i16> %c, %b
				%m = bitcast <8 x i16> %mul to <8 x half>
				%add = fadd fast <8 x half> %a, %m
				ret <8 x half> %add
				}

				; <4 x half> vector: the fmul result is the subtrahend (second operand) of
				; the fsub, i.e. a - c*b. The CHECK line expects a fused fmls on .4h registers.
				define <4 x half> @test_FMLSv4f16_OP2(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
				; CHECK-LABEL: test_FMLSv4f16_OP2:
				; CHECK: fmls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = fmul fast <4 x half> %c, %b
				%sub = fsub fast <4 x half> %a, %mul
				ret <4 x half> %sub
				}

				; <8 x half> vector: the fmul result is the minuend (first operand) of the
				; fsub, i.e. c*b - a. The CHECK line expects an fmls on .8h registers.
				; NOTE(review): this is the opposite operand order from the _OP2 variant
				; (a - c*b), yet the same fmls mnemonic is expected; presumably fmls computes
				; acc - x*y, so confirm the expected instruction is correct for this order.
				define <8 x half> @test_FMLSv8f16_OP1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
				; CHECK-LABEL: test_FMLSv8f16_OP1:
				; CHECK: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = fmul fast <8 x half> %c, %b
				%sub = fsub fast <8 x half> %mul, %a
				ret <8 x half> %sub
				}

				; <8 x half> vector: the fmul result is the subtrahend (second operand) of
				; the fsub, i.e. a - c*b. The CHECK line expects a fused fmls on .8h registers.
				define <8 x half> @test_FMLSv8f16_OP2(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
				; CHECK-LABEL: test_FMLSv8f16_OP2:
				; CHECK: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = fmul fast <8 x half> %c, %b
				%sub = fsub fast <8 x half> %a, %mul
				ret <8 x half> %sub
				}

				; Pins the CURRENT (unfused) codegen for a <4 x i16> product that is bitcast
				; to <4 x half> and used as the subtrahend (second operand) of an fsub.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <4 x half> @test_FMLSv4i16_indexed_OP2(<4 x half> %a, <4 x i16> %b, <4 x i16> %c) {
				; CHECK-LABEL: test_FMLSv4i16_indexed_OP2:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fsub
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = mul <4 x i16> %c, %b
				%m = bitcast <4 x i16> %mul to <4 x half>
				%sub = fsub fast <4 x half> %a, %m
				ret <4 x half> %sub
				}

				; Pins the CURRENT (unfused) codegen for an <8 x i16> product that is bitcast
				; to <8 x half> and used as the minuend (first operand) of an fsub.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <8 x half> @test_FMLSv8i16_indexed_OP1(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {
				; CHECK-LABEL: test_FMLSv8i16_indexed_OP1:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fsub
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = mul <8 x i16> %c, %b
				%m = bitcast <8 x i16> %mul to <8 x half>
				%sub = fsub fast <8 x half> %m, %a
				ret <8 x half> %sub
				}

				; Pins the CURRENT (unfused) codegen for an <8 x i16> product that is bitcast
				; to <8 x half> and used as the subtrahend (second operand) of an fsub.
				; NOTE(review): the product is an integer mul, not an fmul; presumably an
				; fmul by a splatted lane is needed to reach the indexed pattern - confirm.
				define <8 x half> @test_FMLSv8i16_indexed_OP2(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {
				; CHECK-LABEL: test_FMLSv8i16_indexed_OP2:
				; FIXME: Currently LLVM produces inefficient code:
				; CHECK: mul
				; CHECK: fsub
				; FIXME: It should instead produce the following instruction:
				; FIXME: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				entry:
				%mul = mul <8 x i16> %c, %b
				%m = bitcast <8 x i16> %mul to <8 x half>
				%sub = fsub fast <8 x half> %a, %m
				ret <8 x half> %sub
				}