This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
include/llvm/Support/
-
llvm/
-
Support/
-
MachineValueType.h
-
lib/Target/AArch64/
-
Target/
-
AArch64/
1
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
sve-fixed-length-fp-reduce.ll

Differential D132573

[AArch64 - SVE]: Use SVE to lower reduce.fadd.
ClosedPublic

Authored by hassnaa-arm on Aug 24 2022, 9:59 AM.

Download Raw Diff

Details

Reviewers

sdesmalen
david-arm

Commits

rGa6d9c944df95: [AArch64 - SVE]: Use SVE to lower reduce.fadd.

Summary

[AArch64 - SVE]: Use SVE to lower reduce.fadd.
Enable custom-lowering of NEON-like fixed-width vector versions of @llvm.vector.reduce.fadd to use SVE's fadda instruction, if SVE is enabled.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

hassnaa-arm created this revision. Aug 24 2022, 9:59 AM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 24 2022, 9:59 AM

Herald added subscribers: ctetreau, hiraditya, kristof.beyls, tschuett. · View Herald Transcript

hassnaa-arm requested review of this revision. Aug 24 2022, 9:59 AM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 24 2022, 9:59 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

hassnaa-arm added a reviewer: sdesmalen. Aug 24 2022, 10:00 AM

Matt added a subscriber: Matt. Aug 24 2022, 11:16 AM

Harbormaster completed remote builds in B183144: Diff 455261. Aug 24 2022, 11:23 AM

sdesmalen added inline comments. Aug 24 2022, 2:04 PM

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
1384–1385	This is quite a limited set of types. I wonder if it actually supports VECREDUCE_SEQ_FADD for e.g. <8 x float> when the vector length is 256, or <8 x double> when the vector length is 512. Can you try this out? Some other code in this file iterates through all possible FP/integer values and uses `useSVEForFixedLengthVectorVT()` to determine whether SVE should be used. This function has an option named `OverrideNEON`, which you can use to make it work for 128bit vectors.
llvm/test/CodeGen/AArch64/sve-fixed-length-fadd-reduce.ll
2 ↗	(On Diff #455261)	Could you also add some RUN lines for other vector lengths?

dmgreen added a subscriber: dmgreen. Aug 25 2022, 1:23 AM

dmgreen added inline comments.

llvm/test/CodeGen/AArch64/sve-fixed-length-fadd-reduce.ll
65 ↗	(On Diff #455261)	This is just an fadd, and is probably simpler to leave as it was before.

hassnaa-arm added a reviewer: david-arm. Aug 30 2022, 4:31 AM

david-arm added inline comments. Aug 30 2022, 5:37 AM

llvm/test/CodeGen/AArch64/sve-fixed-length-fadd-reduce.ll
1 ↗	(On Diff #455261)	Hi @hassnaa-arm, could you just reuse the existing fixed-length tests in CodeGen/AArch64/sve-fixed-length-fp-reduce.ll and just add another RUN line: `; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_128`? That test file already contains many of the tests added here.
2 ↗	(On Diff #455261)	I think if we reuse sve-fixed-length-fp-reduce.ll, then we get the other lengths for free?

Changes of generated test code based on the new changes in codebase.

LGTM!

This revision is now accepted and ready to land. Aug 31 2022, 5:23 AM

This revision was landed with ongoing or failed builds. Aug 31 2022, 5:31 AM

Closed by commit rGa6d9c944df95: [AArch64 - SVE]: Use SVE to lower reduce.fadd. (authored by Hassnaa Hamdi <hassnaa.hamdi@arm.com>). · Explain Why

This revision was automatically updated to reflect the committed changes.

hassnaa-arm added a commit: rGa6d9c944df95: [AArch64 - SVE]: Use SVE to lower reduce.fadd..

Harbormaster completed remote builds in B184344: Diff 456932. Aug 31 2022, 5:46 AM

Revision Contents

Path

Size

llvm/

include/

llvm/

Support/

MachineValueType.h

2 lines

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

8 lines

test/

CodeGen/

AArch64/

sve-fixed-length-fp-reduce.ll

62 lines

Diff 456936

llvm/include/llvm/Support/MachineValueType.h

Show First 20 Lines • Show All 140 Lines • ▼ Show 20 Lines	enum SimpleValueType : uint8_t {
v3f16 = 81, // 3 x f16		v3f16 = 81, // 3 x f16
v4f16 = 82, // 4 x f16		v4f16 = 82, // 4 x f16
v8f16 = 83, // 8 x f16		v8f16 = 83, // 8 x f16
v16f16 = 84, // 16 x f16		v16f16 = 84, // 16 x f16
v32f16 = 85, // 32 x f16		v32f16 = 85, // 32 x f16
v64f16 = 86, // 64 x f16		v64f16 = 86, // 64 x f16
v128f16 = 87, // 128 x f16		v128f16 = 87, // 128 x f16
v256f16 = 88, // 256 x f16		v256f16 = 88, // 256 x f16
v512f16 = 89, // 256 x f16		v512f16 = 89, // 512 x f16

v2bf16 = 90, // 2 x bf16		v2bf16 = 90, // 2 x bf16
v3bf16 = 91, // 3 x bf16		v3bf16 = 91, // 3 x bf16
v4bf16 = 92, // 4 x bf16		v4bf16 = 92, // 4 x bf16
v8bf16 = 93, // 8 x bf16		v8bf16 = 93, // 8 x bf16
v16bf16 = 94, // 16 x bf16		v16bf16 = 94, // 16 x bf16
v32bf16 = 95, // 32 x bf16		v32bf16 = 95, // 32 x bf16
v64bf16 = 96, // 64 x bf16		v64bf16 = 96, // 64 x bf16
▲ Show 20 Lines • Show All 1,361 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,374 Lines • ▼ Show 20 Lines	for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
setOperationAction(ISD::SDIV, VT, Custom);		setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);		setOperationAction(ISD::UDIV, VT, Custom);
}		}

// NEON doesn't support 64-bit vector integer muls, but SVE does.		// NEON doesn't support 64-bit vector integer muls, but SVE does.
setOperationAction(ISD::MUL, MVT::v1i64, Custom);		setOperationAction(ISD::MUL, MVT::v1i64, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);		setOperationAction(ISD::MUL, MVT::v2i64, Custom);

		// NEON doesn't support across-vector reductions, but SVE does.
		for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
		setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
		sdesmalen (inline comment, not done): This is quite a limited set of types. I wonder if it actually supports VECREDUCE_SEQ_FADD for e.g. <8 x float> when the vector length is 256, or <8 x double> when the vector length is 512. Can you try this out? Some other code in this file iterates through all possible FP/integer values and uses `useSVEForFixedLengthVectorVT()` to determine whether SVE should be used. This function has an option named `OverrideNEON`, which you can use to make it work for 128bit vectors.

// NOTE: Currently this has to happen after computeRegisterProperties rather		// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.		// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {		if (Subtarget->useSVEForFixedLengthVectors()) {
for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())		for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))		if (useSVEForFixedLengthVectorVT(VT))
addTypeForFixedLengthSVE(VT);		addTypeForFixedLengthSVE(VT);
for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())		for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
if (useSVEForFixedLengthVectorVT(VT))		if (useSVEForFixedLengthVectorVT(VT))
Show All 37 Lines	if (Subtarget->useSVEForFixedLengthVectors()) {
MVT::v2i32, MVT::v4i32, MVT::v2i64}) {		MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);		setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);		setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);		setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);		setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);		setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
}		}

// FP operations with no NEON support.
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
MVT::v1f64, MVT::v2f64})
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);

// Use SVE for vectors with more than 2 elements.		// Use SVE for vectors with more than 2 elements.
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})		for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);		setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
}		}

setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);		setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);		setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
▲ Show 20 Lines • Show All 20,785 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fixed-length-fp-reduce.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512		; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512		; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"		target triple = "aarch64-unknown-linux-gnu"

;		;
; FADDA		; FADDA
;		;

; No single instruction NEON support. Use SVE.		; No single instruction NEON support. Use SVE.
define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {		define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v4f16:		; CHECK-LABEL: fadda_v4f16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1		; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: mov h2, v1.h[1]		; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: fadd h0, h0, h1		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mov h3, v1.h[2]		; CHECK-NEXT: fadda h0, p0, h0, z1.h
; CHECK-NEXT: mov h1, v1.h[3]		; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: fadd h0, h0, h2
; CHECK-NEXT: fadd h0, h0, h3
; CHECK-NEXT: fadd h0, h0, h1
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)		%res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
ret half %res		ret half %res
}		}

; No single instruction NEON support. Use SVE.		; No single instruction NEON support. Use SVE.
define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(1,0) #0 {		define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v8f16:		; CHECK-LABEL: fadda_v8f16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: mov h2, v1.h[1]		; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: fadd h0, h0, h1		; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: mov h3, v1.h[2]		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: fadd h0, h0, h2		; CHECK-NEXT: fadda h0, p0, h0, z1.h
; CHECK-NEXT: mov h2, v1.h[3]		; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
; CHECK-NEXT: fadd h0, h0, h3
; CHECK-NEXT: mov h3, v1.h[4]
; CHECK-NEXT: fadd h0, h0, h2
; CHECK-NEXT: mov h2, v1.h[5]
; CHECK-NEXT: fadd h0, h0, h3
; CHECK-NEXT: mov h3, v1.h[6]
; CHECK-NEXT: mov h1, v1.h[7]
; CHECK-NEXT: fadd h0, h0, h2
; CHECK-NEXT: fadd h0, h0, h3
; CHECK-NEXT: fadd h0, h0, h1
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)		%res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
ret half %res		ret half %res
}		}

define half @fadda_v16f16(half %start, <16 x half>* %a) vscale_range(2,0) #0 {		define half @fadda_v16f16(half %start, <16 x half>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v16f16:		; CHECK-LABEL: fadda_v16f16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)		%res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
ret half %res		ret half %res
}		}

; No single instruction NEON support. Use SVE.		; No single instruction NEON support. Use SVE.
define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {		define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v2f32:		; CHECK-LABEL: fadda_v2f32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1		; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: mov s2, v1.s[1]		; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: fadd s0, s0, s1		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: fadd s0, s0, s2		; CHECK-NEXT: fadda s0, p0, s0, z1.s
		; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)		%res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
ret float %res		ret float %res
}		}

; No single instruction NEON support. Use SVE.		; No single instruction NEON support. Use SVE.
define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(1,0) #0 {		define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v4f32:		; CHECK-LABEL: fadda_v4f32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: mov s2, v1.s[1]		; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
; CHECK-NEXT: fadd s0, s0, s1		; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov s3, v1.s[2]		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: mov s1, v1.s[3]		; CHECK-NEXT: fadda s0, p0, s0, z1.s
; CHECK-NEXT: fadd s0, s0, s2		; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: fadd s0, s0, s3
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)		%res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
ret float %res		ret float %res
}		}

define float @fadda_v8f32(float %start, <8 x float>* %a) vscale_range(2,0) #0 {		define float @fadda_v8f32(float %start, <8 x float>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v8f32:		; CHECK-LABEL: fadda_v8f32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)		%res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
ret double %res		ret double %res
}		}

; No single instruction NEON support. Use SVE.		; No single instruction NEON support. Use SVE.
define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(1,0) #0 {		define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(1,0) #0 {
; CHECK-LABEL: fadda_v2f64:		; CHECK-LABEL: fadda_v2f64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: mov d2, v1.d[1]		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: fadd d0, d0, d1		; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: fadd d0, d0, d2		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: fadda d0, p0, d0, z1.d
		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)		%res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
ret double %res		ret double %res
}		}

define double @fadda_v4f64(double %start, <4 x double>* %a) vscale_range(2,0) #0 {		define double @fadda_v4f64(double %start, <4 x double>* %a) vscale_range(2,0) #0 {
; CHECK-LABEL: fadda_v4f64:		; CHECK-LABEL: fadda_v4f64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
▲ Show 20 Lines • Show All 897 Lines • Show Last 20 Lines