This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33
ClosedPublic

Authored by SjoerdMeijer on Sep 20 2018, 1:53 AM.

Download Raw Diff

Details

Reviewers

samparker
dmgreen
t.p.northover
john.brawn
javed.absar

Commits

rGd986ede31322: [ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33
rL342874: [ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33

Summary

A sequence of VMUL and VADD instructions always gives the same or better
performance than a fused VMLA instruction on the Cortex-M4 and Cortex-M33.
Executing the VMUL and VADD back-to-back requires the same number of cycles,
but having separate instructions allows scheduling to avoid the hazard between
these 2 instructions.

Diff Detail

Repository: rL LLVM

Event Timeline

SjoerdMeijer created this revision. · Sep 20 2018, 1:53 AM

Herald added a reviewer: javed.absar. · View Herald Transcript · Sep 20 2018, 1:53 AM

Herald added subscribers: chrib, kristof.beyls. · View Herald Transcript

Reshuffled the tests a bit.

Shouldn't we also consider code size here?

Good point. I wanted to worry about that later in a follow up patch, but perhaps that doesn't make sense. I will fix it now.

dmgreen added inline comments. · Sep 20 2018, 3:41 AM

test/CodeGen/Thumb2/float-intrinsics-float.ll
192 ↗	(On Diff #166250)	This comment can be removed now?

Thanks for the reviews.
Now take code size into account, and removed outdated comment.

LGTM, cheers.

Sam Parker

Compilation Tools Engineer | Arm

. . . . . . . . . . . . . . . . . . . . . . . . . . .

Arm.com

From: Sjoerd Meijer via Phabricator <reviews@reviews.llvm.org>
Sent: 21 September 2018 15:45:01
To: Sjoerd Meijer; Sam Parker; David Green; t.p.northover@gmail.com; John Brawn; Javed Absar
Cc: Kristof Beyls; christian.bruel@st.com; llvm-commits@lists.llvm.org; kanheim@a-bix.com; James Molloy; diana.picus@linaro.org; Florian Hahn
Subject: [PATCH] D52289: [ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33

SjoerdMeijer updated this revision to Diff 166483.
SjoerdMeijer added a comment.

Thanks for the reviews.
Now take code size into account, and removed outdated comment.

https://reviews.llvm.org/D52289

Files:

lib/Target/ARM/ARM.td
lib/Target/ARM/ARMInstrInfo.td
test/CodeGen/ARM/fmacs.ll
test/CodeGen/Thumb2/float-intrinsics-float.ll

Index: test/CodeGen/Thumb2/float-intrinsics-float.ll

test/CodeGen/Thumb2/float-intrinsics-float.ll

+++ test/CodeGen/Thumb2/float-intrinsics-float.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
-; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
+; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m33 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP -check-prefix=FP-ARMv8 -check-prefix=VMLA
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7 | FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
@@ -188,8 +189,6 @@

ret float %1

}

-; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
-; (these should be equivalent, even the rounding is the same)
declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
define float @fmuladd_f(float %a, float %b, float %c) {
; CHECK-LABEL: fmuladd_f:

Index: test/CodeGen/ARM/fmacs.ll

test/CodeGen/ARM/fmacs.ll

+++ test/CodeGen/ARM/fmacs.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=A8
; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - | FileCheck %s -check-prefix=HARD
+; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA
+; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m33 -float-abi=hard %s -o - | FileCheck %s -check-prefix=VMLA

define float @t1(float %acc, float %a, float %b) {
entry:
@@ -15,6 +17,21 @@
; A8-LABEL: t1:
; A8: vmul.f32
; A8: vadd.f32
+
+; VMLA-LABEL: t1:
+; VMLA: vmul.f32
+; VMLA-NEXT: vadd.f32
+
+ %0 = fmul float %a, %b
+ %1 = fadd float %acc, %0
+ ret float %1
+}
+
+define float @vlma_minsize(float %acc, float %a, float %b) #0 {
+entry:
+; VMLA-LABEL: vlma_minsize:
+; VLMA: vmla.f32 s0, s1, s2
+

%0 = fmul float %a, %b
%1 = fadd float %acc, %0
      ret float %1

@@ -102,3 +119,5 @@

%3 = fadd float %1, %2
ret float %3

}
+
+attributes #0 = { minsize nounwind optsize }

Index: lib/Target/ARM/ARMInstrInfo.td

lib/Target/ARM/ARMInstrInfo.td

+++ lib/Target/ARM/ARMInstrInfo.td
@@ -353,10 +353,10 @@
let RecomputePerFunction = 1 in {

def UseMovt          : Predicate<"Subtarget->useMovt(*MF)">;
def DontUseMovt      : Predicate<"!Subtarget->useMovt(*MF)">;

def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;

+ def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
+ def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) || !Subtarget->allowPositionIndependentMovt()">;
+ def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() || MF->getFunction().optForMinSize()">;
}
-def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
def UseMulOps : Predicate<"Subtarget->useMulOps()">;

// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.

Index: lib/Target/ARM/ARM.td

lib/Target/ARM/ARM.td

+++ lib/Target/ARM/ARM.td
@@ -966,6 +966,7 @@

FeatureVFPOnlySP,
FeatureD16,
FeaturePrefLoopAlign32,

+ FeatureHasSlowFPVMLx,

FeatureHasNoBranchPredictor]>;

def : ProcNoItin<"cortex-m7", [ARMv7em,
@@ -981,6 +982,7 @@

FeatureD16,
FeatureVFPOnlySP,
FeaturePrefLoopAlign32,

+ FeatureHasSlowFPVMLx,

FeatureHasNoBranchPredictor]>;

def : ProcNoItin<"cortex-a32", [ARMv8a,

msg-20362-369.txt · 162 B · Download

This revision was not accepted when it landed; it landed in state Needs Review. · Sep 24 2018, 7:24 AM

Closed by commit rL342874: [ARM] Do not fuse VADD and VMUL on the Cortex-M4 and Cortex-M33 (authored by SjoerdMeijer). · Explain Why

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

ARM/

ARM.td

2 lines

ARMInstrInfo.td

6 lines

test/

CodeGen/

ARM/

fmacs.ll

19 lines

Thumb2/

float-intrinsics-float.ll

5 lines

Diff 166678

llvm/trunk/lib/Target/ARM/ARM.td

Show First 20 Lines • Show All 960 Lines • ▼ Show 20 Lines	def : ProcessorModel<"sc300", CortexM3Model, [ARMv7m,
ProcM3,		ProcM3,
FeatureHasNoBranchPredictor]>;		FeatureHasNoBranchPredictor]>;

def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,		def : ProcessorModel<"cortex-m4", CortexM3Model, [ARMv7em,
FeatureVFP4,		FeatureVFP4,
FeatureVFPOnlySP,		FeatureVFPOnlySP,
FeatureD16,		FeatureD16,
FeaturePrefLoopAlign32,		FeaturePrefLoopAlign32,
		FeatureHasSlowFPVMLx,
FeatureHasNoBranchPredictor]>;		FeatureHasNoBranchPredictor]>;

def : ProcNoItin<"cortex-m7", [ARMv7em,		def : ProcNoItin<"cortex-m7", [ARMv7em,
FeatureFPARMv8,		FeatureFPARMv8,
FeatureD16]>;		FeatureD16]>;

def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,		def : ProcNoItin<"cortex-m23", [ARMv8mBaseline,
FeatureNoMovt]>;		FeatureNoMovt]>;

def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,		def : ProcessorModel<"cortex-m33", CortexM3Model, [ARMv8mMainline,
FeatureDSP,		FeatureDSP,
FeatureFPARMv8,		FeatureFPARMv8,
FeatureD16,		FeatureD16,
FeatureVFPOnlySP,		FeatureVFPOnlySP,
FeaturePrefLoopAlign32,		FeaturePrefLoopAlign32,
		FeatureHasSlowFPVMLx,
FeatureHasNoBranchPredictor]>;		FeatureHasNoBranchPredictor]>;

def : ProcNoItin<"cortex-a32", [ARMv8a,		def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,		FeatureHWDivThumb,
FeatureHWDivARM,		FeatureHWDivARM,
FeatureCrypto,		FeatureCrypto,
FeatureCRC]>;		FeatureCRC]>;

▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/ARM/ARMInstrInfo.td

Show First 20 Lines • Show All 347 Lines • ▼ Show 20 Lines	def UseNegativeImmediates :
Predicate<"false">,		Predicate<"false">,
AssemblerPredicate<"!FeatureNoNegativeImmediates",		AssemblerPredicate<"!FeatureNoNegativeImmediates",
"NegativeImmediates">;		"NegativeImmediates">;

// FIXME: Eventually this will be just "hasV6T2Ops".		// FIXME: Eventually this will be just "hasV6T2Ops".
let RecomputePerFunction = 1 in {		let RecomputePerFunction = 1 in {
def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;		def UseMovt : Predicate<"Subtarget->useMovt(*MF)">;
def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;		def DontUseMovt : Predicate<"!Subtarget->useMovt(*MF)">;
def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;		def UseMovtInPic : Predicate<"Subtarget->useMovt(*MF) && Subtarget->allowPositionIndependentMovt()">;
def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) \|\| !Subtarget->allowPositionIndependentMovt()">;		def DontUseMovtInPic : Predicate<"!Subtarget->useMovt(*MF) \|\| !Subtarget->allowPositionIndependentMovt()">;
		def UseFPVMLx : Predicate<"Subtarget->useFPVMLx() \|\| MF->getFunction().optForMinSize()">;
}		}
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
def UseMulOps : Predicate<"Subtarget->useMulOps()">;		def UseMulOps : Predicate<"Subtarget->useMulOps()">;

// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.		// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
// But only select them if more precision in FP computation is allowed.		// But only select them if more precision in FP computation is allowed.
// Do not use them for Darwin platforms.		// Do not use them for Darwin platforms.
def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="		def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
" FPOpFusion::Fast && "		" FPOpFusion::Fast && "
" Subtarget->hasVFP4()) && "		" Subtarget->hasVFP4()) && "
▲ Show 20 Lines • Show All 5,810 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/fmacs.ll

	; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \| FileCheck %s -check-prefix=VFP2			; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \| FileCheck %s -check-prefix=VFP2
	; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \| FileCheck %s -check-prefix=NEON			; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - \| FileCheck %s -check-prefix=NEON
	; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \| FileCheck %s -check-prefix=A8			; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \| FileCheck %s -check-prefix=A8
	; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \| FileCheck %s -check-prefix=A9			; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \| FileCheck %s -check-prefix=A9
	; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - \| FileCheck %s -check-prefix=HARD			; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard %s -o - \| FileCheck %s -check-prefix=HARD
				; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard %s -o - \| FileCheck %s -check-prefix=VMLA
				; RUN: llc -mtriple=arm-linux-gnueabi -mcpu=cortex-m33 -float-abi=hard %s -o - \| FileCheck %s -check-prefix=VMLA

	define float @t1(float %acc, float %a, float %b) {			define float @t1(float %acc, float %a, float %b) {
	entry:			entry:
	; VFP2-LABEL: t1:			; VFP2-LABEL: t1:
	; VFP2: vmla.f32			; VFP2: vmla.f32

	; NEON-LABEL: t1:			; NEON-LABEL: t1:
	; NEON: vmla.f32			; NEON: vmla.f32

	; A8-LABEL: t1:			; A8-LABEL: t1:
	; A8: vmul.f32			; A8: vmul.f32
	; A8: vadd.f32			; A8: vadd.f32

				; VMLA-LABEL: t1:
				; VMLA: vmul.f32
				; VMLA-NEXT: vadd.f32

				%0 = fmul float %a, %b
				%1 = fadd float %acc, %0
				ret float %1
				}

				define float @vlma_minsize(float %acc, float %a, float %b) #0 {
				entry:
				; VMLA-LABEL: vlma_minsize:
				; VLMA: vmla.f32 s0, s1, s2

	%0 = fmul float %a, %b			%0 = fmul float %a, %b
	%1 = fadd float %acc, %0			%1 = fadd float %acc, %0
	ret float %1			ret float %1
	}			}

	define double @t2(double %acc, double %a, double %b) {			define double @t2(double %acc, double %a, double %b) {
	entry:			entry:
	; VFP2-LABEL: t2:			; VFP2-LABEL: t2:
	▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines
	; HARD: vmul.f32 s0, s2, s3			; HARD: vmul.f32 s0, s2, s3
	; HARD: vadd.f32 s0, s4, s0			; HARD: vadd.f32 s0, s4, s0
	%0 = fmul float %a, %b			%0 = fmul float %a, %b
	%1 = fadd float %e, %0			%1 = fadd float %e, %0
	%2 = fmul float %c, %d			%2 = fmul float %c, %d
	%3 = fadd float %1, %2			%3 = fadd float %1, %2
	ret float %3			ret float %3
	}			}

				attributes #0 = { minsize nounwind optsize }

llvm/trunk/test/CodeGen/Thumb2/float-intrinsics-float.ll

	; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 \| FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE			; RUN: llc < %s -mtriple=thumbv7-none-eabi -mcpu=cortex-m3 \| FileCheck %s -check-prefix=CHECK -check-prefix=SOFT -check-prefix=NONE
	; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=VMLA			; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m4 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
				; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m33 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=NO-VMLA
	; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP -check-prefix=FP-ARMv8 -check-prefix=VMLA			; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=VFP -check-prefix=FP-ARMv8 -check-prefix=VMLA
	; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA			; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-m7 -mattr=+fp-only-sp \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=SP -check-prefix=FP-ARMv8 -check-prefix=VMLA
	; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA			; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a7 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=VFP4 -check-prefix=NO-VMLA
	; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a57 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=FP-ARMv8 -check-prefix=VMLA			; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mcpu=cortex-a57 \| FileCheck %s -check-prefix=CHECK -check-prefix=HARD -check-prefix=DP -check-prefix=NEON -check-prefix=FP-ARMv8 -check-prefix=VMLA

	declare float @llvm.sqrt.f32(float %Val)			declare float @llvm.sqrt.f32(float %Val)
	define float @sqrt_f(float %a) {			define float @sqrt_f(float %a) {
	; CHECK-LABEL: sqrt_f:			; CHECK-LABEL: sqrt_f:
	▲ Show 20 Lines • Show All 172 Lines • ▼ Show 20 Lines
	; CHECK-LABEL: round_f:			; CHECK-LABEL: round_f:
	; SOFT: bl roundf			; SOFT: bl roundf
	; VFP4: b roundf			; VFP4: b roundf
	; FP-ARMv8: vrinta.f32			; FP-ARMv8: vrinta.f32
	%1 = call float @llvm.round.f32(float %a)			%1 = call float @llvm.round.f32(float %a)
	ret float %1			ret float %1
	}			}

	; FIXME: why does cortex-m4 use vmla, while cortex-a7 uses vmul+vadd?
	; (these should be equivalent, even the rounding is the same)
	declare float @llvm.fmuladd.f32(float %a, float %b, float %c)			declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
	define float @fmuladd_f(float %a, float %b, float %c) {			define float @fmuladd_f(float %a, float %b, float %c) {
	; CHECK-LABEL: fmuladd_f:			; CHECK-LABEL: fmuladd_f:
	; SOFT: bl __aeabi_fmul			; SOFT: bl __aeabi_fmul
	; SOFT: bl __aeabi_fadd			; SOFT: bl __aeabi_fadd
	; VMLA: vmla.f32			; VMLA: vmla.f32
	; NO-VMLA: vmul.f32			; NO-VMLA: vmul.f32
	; NO-VMLA: vadd.f32			; NO-VMLA: vadd.f32
	Show All 21 Lines