This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
include/llvm/CodeGen/
-
llvm/
-
CodeGen/
-
MachineCombinerPattern.h
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64InstrInfo.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
1/3
fp16-fmla.ll

Differential D67990

[aarch64] fix generation of fp16 fmls
ClosedPublic

Authored by sebpop on Sep 24 2019, 3:10 PM.

Download Raw Diff

Details

Reviewers

t.p.northover
SjoerdMeijer

Commits

rGd0d52edae92f: fix fmls fp16
rL374044: fix fmls fp16

Summary

Tim remarked that the added patterns produce wrong code when the fsub
instruction has a multiplication as its first operand, i.e., for all the FMLSv*_OP1 patterns:

define <8 x half> @test_FMLSv8f16_OP1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; CHECK-LABEL: test_FMLSv8f16_OP1:
; CHECK: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
entry:
%mul = fmul fast <8 x half> %c, %b
%sub = fsub fast <8 x half> %mul, %a
ret <8 x half> %sub
}

This doesn't look right to me. The exact instruction produced is "fmls
v0.8h, v2.8h, v1.8h", which I think calculates "v0 - v2*v1", but the
IR is calculating "v2*v1-v0". The equivalent <4 x float> code also
doesn't emit an fmls.

This patch instead negates the second operand of the fsub and generates an fmla.
Inspecting the pattern match, I found that there was another mistake in the
opcode to be selected: matching FMULv4*16 should generate FMLSv4*16
and not FMLSv2*32.

Tested on aarch64-linux with make check-all.

Diff Detail

Event Timeline

sebpop created this revision.Sep 24 2019, 3:10 PM

Herald added a project: Restricted Project. · View Herald TranscriptSep 24 2019, 3:10 PM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

Ping.

SjoerdMeijer added inline comments.Oct 7 2019, 8:28 AM

llvm/test/CodeGen/AArch64/fp16-fmla.ll
163	Why are we not generating a fmls? And a nit, but perhaps actually just using registers v0, v1, and v2 here makes things clearer?

sebpop marked an inline comment as done.Oct 7 2019, 11:24 AM

sebpop added inline comments.

llvm/test/CodeGen/AArch64/fp16-fmla.ll
163	That is part of the problem that Tim pointed out: when the multiply is the first operand of `fsub`, i.e., %sub = fsub fast <8 x half> %mul, %a that should not generate a fused multiply sub. With this patch, for `b * c - a` we negate the value of a and generate a fused multiply add `-a + b * c`.

Cheers, lgtm

llvm/test/CodeGen/AArch64/fp16-fmla.ll
163	Thanks, I just got myself confused here.

This revision is now accepted and ready to land.Oct 8 2019, 1:33 AM

Closed by commit rGd0d52edae92f: fix fmls fp16 (authored by Sebastian Pop <spop@amazon.com>). · Explain WhyOct 8 2019, 6:23 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

include/

llvm/

CodeGen/

MachineCombinerPattern.h

2 lines

lib/

Target/

AArch64/

AArch64InstrInfo.cpp

53 lines

test/

CodeGen/

AArch64/

fp16-fmla.ll

16 lines

Diff 221606

llvm/include/llvm/CodeGen/MachineCombinerPattern.h

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	enum class MachineCombinerPattern {
FMLAv2i64_indexed_OP1,		FMLAv2i64_indexed_OP1,
FMLAv2i64_indexed_OP2,		FMLAv2i64_indexed_OP2,
FMLAv4f32_OP1,		FMLAv4f32_OP1,
FMLAv4f32_OP2,		FMLAv4f32_OP2,
FMLAv4i32_indexed_OP1,		FMLAv4i32_indexed_OP1,
FMLAv4i32_indexed_OP2,		FMLAv4i32_indexed_OP2,
FMLSv1i32_indexed_OP2,		FMLSv1i32_indexed_OP2,
FMLSv1i64_indexed_OP2,		FMLSv1i64_indexed_OP2,
		FMLSv4f16_OP1,
FMLSv4f16_OP2,		FMLSv4f16_OP2,
FMLSv8f16_OP1,		FMLSv8f16_OP1,
FMLSv8f16_OP2,		FMLSv8f16_OP2,
FMLSv2f32_OP1,		FMLSv2f32_OP1,
FMLSv2f32_OP2,		FMLSv2f32_OP2,
FMLSv2f64_OP1,		FMLSv2f64_OP1,
FMLSv2f64_OP2,		FMLSv2f64_OP2,
		FMLSv4i16_indexed_OP1,
FMLSv4i16_indexed_OP2,		FMLSv4i16_indexed_OP2,
FMLSv8i16_indexed_OP1,		FMLSv8i16_indexed_OP1,
FMLSv8i16_indexed_OP2,		FMLSv8i16_indexed_OP2,
FMLSv2i32_indexed_OP1,		FMLSv2i32_indexed_OP1,
FMLSv2i32_indexed_OP2,		FMLSv2i32_indexed_OP2,
FMLSv2i64_indexed_OP1,		FMLSv2i64_indexed_OP1,
FMLSv2i64_indexed_OP2,		FMLSv2i64_indexed_OP2,
FMLSv4f32_OP1,		FMLSv4f32_OP1,
FMLSv4f32_OP2,		FMLSv4f32_OP2,
FMLSv4i32_indexed_OP1,		FMLSv4i32_indexed_OP1,
FMLSv4i32_indexed_OP2		FMLSv4i32_indexed_OP2
};		};

} // end namespace llvm		} // end namespace llvm

#endif		#endif

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Show First 20 Lines • Show All 3,773 Lines • ▼ Show 20 Lines	Found \|= Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) \|\|
Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);		Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2);

Found \|= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);		Found \|= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1);
break;		break;
case AArch64::FSUBv4f16:		case AArch64::FSUBv4f16:
Found \|= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) \|\|		Found \|= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) \|\|
Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);		Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2);

Found \|= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv2i32_indexed_OP1) \|\|		Found \|= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) \|\|
Match(AArch64::FMULv4f16, 1, MCP::FMLSv2f32_OP1);		Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1);
break;		break;
case AArch64::FSUBv8f16:		case AArch64::FSUBv8f16:
Found \|= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) \|\|		Found \|= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) \|\|
Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);		Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2);

Found \|= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) \|\|		Found \|= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) \|\|
Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);		Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1);
break;		break;
▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMLAv2i32_indexed_OP1:		case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
case MachineCombinerPattern::FMLAv2i32_indexed_OP2:		case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
case MachineCombinerPattern::FMLAv2i64_indexed_OP1:		case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
case MachineCombinerPattern::FMLAv2i64_indexed_OP2:		case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
case MachineCombinerPattern::FMLAv4f32_OP1:		case MachineCombinerPattern::FMLAv4f32_OP1:
case MachineCombinerPattern::FMLAv4f32_OP2:		case MachineCombinerPattern::FMLAv4f32_OP2:
case MachineCombinerPattern::FMLAv4i32_indexed_OP1:		case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
case MachineCombinerPattern::FMLAv4i32_indexed_OP2:		case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
		case MachineCombinerPattern::FMLSv4i16_indexed_OP1:
case MachineCombinerPattern::FMLSv4i16_indexed_OP2:		case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
case MachineCombinerPattern::FMLSv8i16_indexed_OP1:		case MachineCombinerPattern::FMLSv8i16_indexed_OP1:
case MachineCombinerPattern::FMLSv8i16_indexed_OP2:		case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:		case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
case MachineCombinerPattern::FMLSv1i64_indexed_OP2:		case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
case MachineCombinerPattern::FMLSv2i32_indexed_OP2:		case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
case MachineCombinerPattern::FMLSv2i64_indexed_OP2:		case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
		case MachineCombinerPattern::FMLSv4f16_OP1:
case MachineCombinerPattern::FMLSv4f16_OP2:		case MachineCombinerPattern::FMLSv4f16_OP2:
case MachineCombinerPattern::FMLSv8f16_OP1:		case MachineCombinerPattern::FMLSv8f16_OP1:
case MachineCombinerPattern::FMLSv8f16_OP2:		case MachineCombinerPattern::FMLSv8f16_OP2:
case MachineCombinerPattern::FMLSv2f32_OP2:		case MachineCombinerPattern::FMLSv2f32_OP2:
case MachineCombinerPattern::FMLSv2f64_OP2:		case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:		case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:		case MachineCombinerPattern::FMLSv4f32_OP2:
return true;		return true;
▲ Show 20 Lines • Show All 586 Lines • ▼ Show 20 Lines	void AArch64InstrInfo::genAlternativeCodeSequence(

case MachineCombinerPattern::FMLSv1i64_indexed_OP2:		case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
Opc = AArch64::FMLSv1i64_indexed;		Opc = AArch64::FMLSv1i64_indexed;
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed);
break;		break;

		case MachineCombinerPattern::FMLSv4f16_OP1:
		case MachineCombinerPattern::FMLSv4i16_indexed_OP1: {
		RC = &AArch64::FPR64RegClass;
		Register NewVR = MRI.createVirtualRegister(RC);
		MachineInstrBuilder MIB1 =
		BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR)
		.add(Root.getOperand(2));
		InsInstrs.push_back(MIB1);
		InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
		if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) {
		Opc = AArch64::FMLAv4f16;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Accumulator, &NewVR);
		} else {
		Opc = AArch64::FMLAv4i16_indexed;
		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
		FMAInstKind::Indexed, &NewVR);
		}
		break;
		}
case MachineCombinerPattern::FMLSv4f16_OP2:		case MachineCombinerPattern::FMLSv4f16_OP2:
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
Opc = AArch64::FMLSv4f16;		Opc = AArch64::FMLSv4f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator);
break;		break;
case MachineCombinerPattern::FMLSv4i16_indexed_OP2:		case MachineCombinerPattern::FMLSv4i16_indexed_OP2:
RC = &AArch64::FPR64RegClass;		RC = &AArch64::FPR64RegClass;
Show All 12 Lines	case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
} else {		} else {
Opc = AArch64::FMLSv2f32;		Opc = AArch64::FMLSv2f32;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator);
}		}
break;		break;

case MachineCombinerPattern::FMLSv8f16_OP1:		case MachineCombinerPattern::FMLSv8f16_OP1:
		case MachineCombinerPattern::FMLSv8i16_indexed_OP1: {
RC = &AArch64::FPR128RegClass;		RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8f16;		Register NewVR = MRI.createVirtualRegister(RC);
		MachineInstrBuilder MIB1 =
		BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR)
		.add(Root.getOperand(2));
		InsInstrs.push_back(MIB1);
		InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
		if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) {
		Opc = AArch64::FMLAv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator, &NewVR);
break;		} else {
case MachineCombinerPattern::FMLSv8i16_indexed_OP1:		Opc = AArch64::FMLAv8i16_indexed;
RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8i16_indexed;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC,
FMAInstKind::Indexed);		FMAInstKind::Indexed, &NewVR);
		}
break;		break;
		}
case MachineCombinerPattern::FMLSv8f16_OP2:		case MachineCombinerPattern::FMLSv8f16_OP2:
RC = &AArch64::FPR128RegClass;		RC = &AArch64::FPR128RegClass;
Opc = AArch64::FMLSv8f16;		Opc = AArch64::FMLSv8f16;
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,		MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC,
FMAInstKind::Accumulator);		FMAInstKind::Accumulator);
break;		break;
case MachineCombinerPattern::FMLSv8i16_indexed_OP2:		case MachineCombinerPattern::FMLSv8i16_indexed_OP2:
RC = &AArch64::FPR128RegClass;		RC = &AArch64::FPR128RegClass;
▲ Show 20 Lines • Show All 1,122 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/fp16-fmla.ll

	Show First 20 Lines • Show All 132 Lines • ▼ Show 20 Lines
	; CHECK-FIXME: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h			; CHECK-FIXME: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
	entry:			entry:
	%mul = mul <8 x i16> %c, %b			%mul = mul <8 x i16> %c, %b
	%m = bitcast <8 x i16> %mul to <8 x half>			%m = bitcast <8 x i16> %mul to <8 x half>
	%add = fadd fast <8 x half> %a, %m			%add = fadd fast <8 x half> %a, %m
	ret <8 x half> %add			ret <8 x half> %add
	}			}

				define <4 x half> @test_FMLSv4f16_OP1(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
				; CHECK-LABEL: test_FMLSv4f16_OP1:
				; CHECK: fneg {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				; CHECK: fmla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
				entry:
				%mul = fmul fast <4 x half> %c, %b
				%sub = fsub fast <4 x half> %mul, %a
				ret <4 x half> %sub
				}

	define <4 x half> @test_FMLSv4f16_OP2(<4 x half> %a, <4 x half> %b, <4 x half> %c) {			define <4 x half> @test_FMLSv4f16_OP2(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
	; CHECK-LABEL: test_FMLSv4f16_OP2:			; CHECK-LABEL: test_FMLSv4f16_OP2:
	; CHECK: fmls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h			; CHECK: fmls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
	entry:			entry:
	%mul = fmul fast <4 x half> %c, %b			%mul = fmul fast <4 x half> %c, %b
	%sub = fsub fast <4 x half> %a, %mul			%sub = fsub fast <4 x half> %a, %mul
	ret <4 x half> %sub			ret <4 x half> %sub
	}			}

	define <8 x half> @test_FMLSv8f16_OP1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {			define <8 x half> @test_FMLSv8f16_OP1(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
	; CHECK-LABEL: test_FMLSv8f16_OP1:			; CHECK-LABEL: test_FMLSv8f16_OP1:
	; CHECK: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h			; CHECK: fneg {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				; CHECK: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				SjoerdMeijerUnsubmitted Not Done Reply Inline Actions Why are we not generating a fmls? And a nit, but perhaps actually just using registers v0, v1, and v2 here makes things clearer? SjoerdMeijer: Why are we not generating a fmls? And a nit, but perhaps actually just using registers v0, v1…
				sebpopAuthorUnsubmitted Done Reply Inline Actions That is part of the problem that Tim pointed out: when the multiply is the first operand of `fsub`, i.e., %sub = fsub fast <8 x half> %mul, %a that should not generate a fused multiply sub. With this patch, for `b * c - a` we negate the value of a and generate a fused multiply add `-a + b * c`. sebpop: That is part of the problem that Tim pointed out: when the multiply is the first operand of…
				SjoerdMeijerUnsubmitted Not Done Reply Inline Actions Thanks, I just got myself confused here. SjoerdMeijer: Thanks, I just got myself confused here.
	entry:			entry:
	%mul = fmul fast <8 x half> %c, %b			%mul = fmul fast <8 x half> %c, %b
	%sub = fsub fast <8 x half> %mul, %a			%sub = fsub fast <8 x half> %mul, %a
	ret <8 x half> %sub			ret <8 x half> %sub
	}			}

	define <8 x half> @test_FMLSv8f16_OP2(<8 x half> %a, <8 x half> %b, <8 x half> %c) {			define <8 x half> @test_FMLSv8f16_OP2(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
	; CHECK-LABEL: test_FMLSv8f16_OP2:			; CHECK-LABEL: test_FMLSv8f16_OP2:
	Show All 19 Lines
	}			}

	define <8 x half> @test_FMLSv8i16_indexed_OP1(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {			define <8 x half> @test_FMLSv8i16_indexed_OP1(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {
	; CHECK-LABEL: test_FMLSv8i16_indexed_OP1:			; CHECK-LABEL: test_FMLSv8i16_indexed_OP1:
	; CHECK-FIXME: Currently LLVM produces inefficient code:			; CHECK-FIXME: Currently LLVM produces inefficient code:
	; CHECK: mul			; CHECK: mul
	; CHECK: fsub			; CHECK: fsub
	; CHECK-FIXME: It should instead produce the following instruction:			; CHECK-FIXME: It should instead produce the following instruction:
	; CHECK-FIXME: fmls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h			; CHECK-FIXME: fneg {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
				; CHECK-FIXME: fmla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
	entry:			entry:
	%mul = mul <8 x i16> %c, %b			%mul = mul <8 x i16> %c, %b
	%m = bitcast <8 x i16> %mul to <8 x half>			%m = bitcast <8 x i16> %mul to <8 x half>
	%sub = fsub fast <8 x half> %m, %a			%sub = fsub fast <8 x half> %m, %a
	ret <8 x half> %sub			ret <8 x half> %sub
	}			}

	define <8 x half> @test_FMLSv8i16_indexed_OP2(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {			define <8 x half> @test_FMLSv8i16_indexed_OP2(<8 x half> %a, <8 x i16> %b, <8 x i16> %c) {
	Show All 12 Lines