This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner] Limit scaling of repeated FP divisor by splat factor
Needs ReviewPublic

Authored by c-rhodes on Jan 27 2022, 5:46 AM.

Download Raw Diff

Details

Reviewers

sdesmalen
dmgreen
david-arm

Summary

In combineRepeatedFPDivisors, only scale the number of uses by the splat
factor if the division can be converted into a scalar op.

Diff Detail

Event Timeline

c-rhodes created this revision.Jan 27 2022, 5:46 AM

Herald added subscribers: ecnelises, hiraditya. · View Herald TranscriptJan 27 2022, 5:46 AM

c-rhodes requested review of this revision.Jan 27 2022, 5:46 AM

Herald added a project: Restricted Project. · View Herald TranscriptJan 27 2022, 5:46 AM

c-rhodes added a parent revision: D118343: [DAGCombiner] Fix invalid size request in combineRepeatedFPDivisors.Jan 27 2022, 5:47 AM

david-arm added inline comments.Feb 2 2022, 6:17 AM

llvm/test/CodeGen/AArch64/fdiv-combine.ll
199	To be honest, the original code looks faster to me even with the extra fmul and fmov. The latency of a fmul is a lot lower than fdiv and the throughput for fdiv is terrible, whereas it's pretty good for fmul.

david-arm resigned from this revision.Aug 31 2023, 2:44 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 31 2023, 2:44 AM

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

2 lines

test/

CodeGen/

AArch64/

fdiv-combine.ll

18 lines

Diff 403623

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 14,530 Lines • ▼ Show 20 Lines	SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
// Exit early if the target does not want this transform or if there can't		// Exit early if the target does not want this transform or if there can't
// possibly be enough uses of the divisor to make the transform worthwhile.		// possibly be enough uses of the divisor to make the transform worthwhile.
unsigned MinUses = TLI.combineRepeatedFPDivisors();		unsigned MinUses = TLI.combineRepeatedFPDivisors();

// For splat vectors, scale the number of uses by the splat factor. If we can		// For splat vectors, scale the number of uses by the splat factor. If we can
// convert the division into a scalar op, that will likely be much faster.		// convert the division into a scalar op, that will likely be much faster.
unsigned NumElts = 1;		unsigned NumElts = 1;
EVT VT = N->getValueType(0);		EVT VT = N->getValueType(0);
if (VT.isVector() && DAG.isSplatValue(N1))		if (VT.isVector() && DAG.isSplatValue(N1) && TLI.isExtractVecEltCheap(VT, 0))
NumElts = VT.getVectorMinNumElements();		NumElts = VT.getVectorMinNumElements();

if (!MinUses || (N1->use_size() * NumElts) < MinUses)		if (!MinUses || (N1->use_size() * NumElts) < MinUses)
return SDValue();		return SDValue();

// Find all FDIV users of the same divisor.		// Find all FDIV users of the same divisor.
// Use a set because duplicates may be present in the user list.		// Use a set because duplicates may be present in the user list.
SetVector<SDNode *> Users;		SetVector<SDNode *> Users;
▲ Show 20 Lines • Show All 9,552 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/fdiv-combine.ll

Show First 20 Lines • Show All 114 Lines • ▼ Show 20 Lines	; CHECK-NEXT: b foo_3_4xf
%div2 = fdiv <4 x float> %c, %splat		%div2 = fdiv <4 x float> %c, %splat
tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)		tail call void @foo_3_4xf(<4 x float> %div, <4 x float> %div1, <4 x float> %div2)
ret void		ret void
}		}

define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {		define <4 x float> @splat_fdiv_v4f32(float %D, <4 x float> %a) #1 {
; CHECK-LABEL: splat_fdiv_v4f32:		; CHECK-LABEL: splat_fdiv_v4f32:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov v2.4s, #1.00000000
; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0		; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
; CHECK-NEXT: dup v0.4s, v0.s[0]		; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: fdiv v0.4s, v2.4s, v0.4s		; CHECK-NEXT: fdiv v0.4s, v1.4s, v0.4s
; CHECK-NEXT: fmul v0.4s, v1.4s, v0.4s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%D.ins = insertelement <4 x float> poison, float %D, i64 0		%D.ins = insertelement <4 x float> poison, float %D, i64 0
%splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer		%splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer
%div = fdiv <4 x float> %a, %splat		%div = fdiv <4 x float> %a, %splat
ret <4 x float> %div		ret <4 x float> %div
}		}

define <vscale x 4 x float> @splat_fdiv_nxv4f32(float %D, <vscale x 4 x float> %a) #1 {		define <vscale x 4 x float> @splat_fdiv_nxv4f32(float %D, <vscale x 4 x float> %a) #1 {
; CHECK-LABEL: splat_fdiv_nxv4f32:		; CHECK-LABEL: splat_fdiv_nxv4f32:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0		; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fmov z2.s, #1.00000000
; CHECK-NEXT: ptrue p0.s		; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z0.s, s0		; CHECK-NEXT: mov z0.s, s0
; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z2.s		; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fmul z0.s, z1.s, z0.s
; CHECK-NEXT: ret		; CHECK-NEXT: ret
entry:		entry:
%D.ins = insertelement <vscale x 4 x float> poison, float %D, i64 0		%D.ins = insertelement <vscale x 4 x float> poison, float %D, i64 0
%splat = shufflevector <vscale x 4 x float> %D.ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer		%splat = shufflevector <vscale x 4 x float> %D.ins, <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
%div = fdiv <vscale x 4 x float> %a, %splat		%div = fdiv <vscale x 4 x float> %a, %splat
ret <vscale x 4 x float> %div		ret <vscale x 4 x float> %div
}		}

Show All 33 Lines	entry:
%div = fdiv <vscale x 2 x double> %a, %splat		%div = fdiv <vscale x 2 x double> %a, %splat
ret <vscale x 2 x double> %div		ret <vscale x 2 x double> %div
}		}

define void @splat_two_fdiv_nxv2f64(double %D, <vscale x 2 x double> %a, <vscale x 2 x double> %b) #1 {		define void @splat_two_fdiv_nxv2f64(double %D, <vscale x 2 x double> %a, <vscale x 2 x double> %b) #1 {
; CHECK-LABEL: splat_two_fdiv_nxv2f64:		; CHECK-LABEL: splat_two_fdiv_nxv2f64:
; CHECK: // %bb.0: // %entry		; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fmov z3.d, #1.00000000
; CHECK-NEXT: ptrue p0.d		; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z0.d, d0		; CHECK-NEXT: mov z3.d, d0
; CHECK-NEXT: fdiv z3.d, p0/m, z3.d, z0.d		; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fmul z0.d, z1.d, z3.d		; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z3.d
; CHECK-NEXT: fmul z1.d, z2.d, z3.d		; CHECK-NEXT: movprfx z1, z2
david-arm (inline comment, unsubmitted, not done): To be honest, the original code looks faster to me even with the extra fmul and fmov. The latency of a fmul is a lot lower than fdiv and the throughput for fdiv is terrible, whereas it's pretty good for fmul.
		; CHECK-NEXT: fdiv z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT: b foo_2_nxv2f64		; CHECK-NEXT: b foo_2_nxv2f64
entry:		entry:
%D.ins = insertelement <vscale x 2 x double> poison, double %D, i64 0		%D.ins = insertelement <vscale x 2 x double> poison, double %D, i64 0
%splat = shufflevector <vscale x 2 x double> %D.ins, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer		%splat = shufflevector <vscale x 2 x double> %D.ins, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
%div = fdiv <vscale x 2 x double> %a, %splat		%div = fdiv <vscale x 2 x double> %a, %splat
%div1 = fdiv <vscale x 2 x double> %b, %splat		%div1 = fdiv <vscale x 2 x double> %b, %splat
tail call void @foo_2_nxv2f64(<vscale x 2 x double> %div, <vscale x 2 x double> %div1)		tail call void @foo_2_nxv2f64(<vscale x 2 x double> %div, <vscale x 2 x double> %div1)
ret void		ret void
Show All 13 Lines