This is an archive of the discontinued LLVM Phabricator instance.

lizhijin retitled this revision from [AArch64] Add patterns to support indexed FMLA/FMLS to [AArch64][SVE] Add patterns to support sve indexed FMLA/FMLS.Jul 6 2023, 11:19 PM

Herald added a subscriber: psnobl. · View Herald TranscriptJul 6 2023, 11:19 PM

ping

paulwalker-arm added inline comments.Jul 7 2023, 5:17 AM

llvm/test/CodeGen/AArch64/sve-fma.ll
7–12	Looking at https://developer.arm.com/documentation/ddi0602/2023-06/SVE-Instructions/FMLA--indexed---Floating-point-fused-multiply-add-by-indexed-elements--Zda---Zda---Zn---Zm-indexed--- I think you've misunderstood how the indexed instructions operate. The indexed FMLA instruction does not multiply all elements of `Zn` by `Zm[0]`; rather, it multiplies the elements within each 128-bit chunk of `Zn` by the element of `Zm` whose index applies to that same 128-bit chunk. Taking a 256-bit SVE implementation, an element type of f32 and an index of 1, the operation is: Za[0] += Zn[0]*Zm[1]; Za[1] += Zn[1]*Zm[1]; Za[2] += Zn[2]*Zm[1]; Za[3] += Zn[3]*Zm[1]; Za[4] += Zn[4]*Zm[5]; Za[5] += Zn[5]*Zm[5]; Za[6] += Zn[6]*Zm[5]; Za[7] += Zn[7]*Zm[5]; This means that, in order for these tests to be functionally the same after the transformation, an explicit splat is required, and thus there'd be little point in using the indexed instruction.

Matt added a subscriber: Matt.Jul 10 2023, 8:40 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64SVEInstrInfo.td

5 lines

SVEInstrFormats.td

12 lines

test/

CodeGen/

AArch64/

sve-fma.ll

148 lines

Diff 537560

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 696 Lines • ▼ Show 20 Lines	let Predicates = [HasSVEorSME] in {
defm FNMLA_ZPZZZ : sve_fp_3op_pred_hfd<AArch64fnmla_p>;		defm FNMLA_ZPZZZ : sve_fp_3op_pred_hfd<AArch64fnmla_p>;
defm FNMLS_ZPZZZ : sve_fp_3op_pred_hfd<AArch64fnmls_p>;		defm FNMLS_ZPZZZ : sve_fp_3op_pred_hfd<AArch64fnmls_p>;
} // End HasSVEorSME		} // End HasSVEorSME

let Predicates = [HasSVE] in {		let Predicates = [HasSVE] in {
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;		defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
} // End HasSVE		} // End HasSVE

		class TriSVEOpFrag<dag res> : PatFrag<(ops node:$pg, node:$zn, node:$zm, node:$za), res>;

let Predicates = [HasSVEorSME] in {		let Predicates = [HasSVEorSME] in {
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b00, "fmla", int_aarch64_sve_fmla_lane>;		defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b00, "fmla", int_aarch64_sve_fmla_lane>;
defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b01, "fmls", int_aarch64_sve_fmls_lane>;		defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b01, "fmls", int_aarch64_sve_fmls_lane>;
		defm : SVEFPIndexedTiedPatterns<"FMLA_ZZZI", TriSVEOpFrag<(AArch64fma_p node:$pg, node:$zn, node:$zm, node:$za)>>;
		defm : SVEFPIndexedTiedPatterns<"FMLS_ZZZI", TriSVEOpFrag<(AArch64fma_p node:$pg, (AArch64fneg_mt node:$pg, node:$zn, (undef)), node:$zm, node:$za)>>;
		defm : SVEFPIndexedTiedPatterns<"FMLS_ZZZI", TriSVEOpFrag<(AArch64fma_p node:$pg, node:$zm, (AArch64fneg_mt node:$pg, node:$zn, (undef)), node:$za)>>;

defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;		defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;		defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
} // End HasSVEorSME		} // End HasSVEorSME

let Predicates = [HasSVE] in {		let Predicates = [HasSVE] in {
// SVE floating point reductions.		// SVE floating point reductions.
defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;		defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
▲ Show 20 Lines • Show All 3,273 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/SVEInstrFormats.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,387 Lines • ▼ Show 20 Lines	multiclass sve_fp_fma_by_indexed_elem<bits<2> opc, string asm,
def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))),		def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>;		(!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>;
def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))),		def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))),
(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;		(!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))),		def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))),
(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;		(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}		}

		// Selection pattern that maps a predicated three-operand node onto an
		// indexed instruction.
		//
		// Matched DAG: OpNode(<any predicate of type vtp>,
		//                     splat_vector(vector_extract($Zda, $idx)),
		//                     $Zn, $Zm), all vectors of type vtd.
		// Emitted:     inst $Zda, $Zn, $Zm, trunc_imm($idx)
		//
		// NOTE(review): the register whose lane is extracted and splatted is bound
		// to $Zda and passed as the instruction's first operand, while the node's
		// last operand is bound to $Zm. If OpNode's operands are
		// (pg, multiplicand, multiplicand, addend) and the instruction's are
		// (accumulator, Zn, indexed Zm, idx), the accumulator and the indexed
		// multiplicand look swapped - confirm.
		// NOTE(review): the review comment on this revision states that the
		// indexed instructions select their element per 128-bit chunk, so matching
		// a whole-vector splat_vector here is presumably not equivalent for vector
		// lengths above 128 bits - confirm.
		class SVEFPIndexedTiedPattern<SDPatternOperator OpNode, ValueType vtd,
		ValueType vtp, ValueType scalartp, Instruction inst, ZPRRegOp zprty, Operand imm_ty>
		: Pat <(vtd (OpNode (vtp (SVEAnyPredicate)),
		(vtd (splat_vector (scalartp (vector_extract (vtd zprty:$Zda), imm_ty:$idx)))),
		(vtd zprty:$Zn), (vtd zprty:$Zm))),
		// The lane index is passed through the "trunc_imm" SDNodeXForm, looked up by name.
		(inst zprty:$Zda, zprty:$Zn, zprty:$Zm, (!cast<SDNodeXForm>("trunc_imm") $idx))>;

		// Instantiates SVEFPIndexedTiedPattern for the three element sizes handled
		// here. INST is the instruction name prefix; "_H", "_S" and "_D" are
		// appended to look up the half, single and double precision instructions,
		// each paired with its vector type, predicate type, scalar type, ZPR
		// register operand and lane-index operand.
		multiclass SVEFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
		def : SVEFPIndexedTiedPattern<OpNode, nxv8f16, nxv8i1, f16, !cast<Instruction>(INST # "_H"), ZPR16, VectorIndexH>;
		def : SVEFPIndexedTiedPattern<OpNode, nxv4f32, nxv4i1, f32, !cast<Instruction>(INST # "_S"), ZPR32, VectorIndexS>;
		def : SVEFPIndexedTiedPattern<OpNode, nxv2f64, nxv2i1, f64, !cast<Instruction>(INST # "_D"), ZPR64, VectorIndexD>;
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply - Indexed Group		// SVE Floating Point Multiply - Indexed Group
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class sve_fp_fmul_by_indexed_elem<bits<2> sz, bit o2, string asm, ZPRRegOp zprty,		class sve_fp_fmul_by_indexed_elem<bits<2> sz, bit o2, string asm, ZPRRegOp zprty,
ZPRRegOp zprty2, Operand itype>		ZPRRegOp zprty2, Operand itype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),		: I<(outs zprty:$Zd), (ins zprty:$Zn, zprty2:$Zm, itype:$iop),
▲ Show 20 Lines • Show All 7,616 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fma.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s \| FileCheck %s

				define <vscale x 8 x half> @sve_fma_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
				; CHECK-LABEL: sve_fma_nxv8f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmla z1.h, z0.h, z2.h[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 8 x half> %b, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
				%mad = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> %b0splat, <vscale x 8 x half> %a, <vscale x 8 x half> %c)
				ret <vscale x 8 x half> %mad
				paulwalker-armUnsubmitted Not Done Reply Inline Actions Looking at https://developer.arm.com/documentation/ddi0602/2023-06/SVE-Instructions/FMLA--indexed---Floating-point-fused-multiply-add-by-indexed-elements--Zda---Zda---Zn---Zm-indexed--- I think you've misunderstood how the indexed instructions operate. The indexed FMLA instruction does not multiply all elements of `Zn` by `Zm[0]`; rather, it multiplies the elements within each 128-bit chunk of `Zn` by the element of `Zm` whose index applies to that same 128-bit chunk. Taking a 256-bit SVE implementation, an element type of f32 and an index of 1, the operation is: Za[0] += Zn[0]*Zm[1]; Za[1] += Zn[1]*Zm[1]; Za[2] += Zn[2]*Zm[1]; Za[3] += Zn[3]*Zm[1]; Za[4] += Zn[4]*Zm[5]; Za[5] += Zn[5]*Zm[5]; Za[6] += Zn[6]*Zm[5]; Za[7] += Zn[7]*Zm[5]; This means that, in order for these tests to be functionally the same after the transformation, an explicit splat is required, and thus there'd be little point in using the indexed instruction. paulwalker-arm: Looking at https://developer.arm.com/documentation/ddi0602/2023-06/SVE-Instructions/FMLA…
				}

				; llvm.fma on <vscale x 4 x float>: lane 0 of %b is broadcast (shufflevector
				; with a zeroinitializer mask) and used as the first multiplicand, %a is the
				; second multiplicand and %c the addend. The autogenerated assertions expect
				; a single indexed fmla followed by a register move.
				; NOTE(review): the review comment on this revision says indexed FMLA picks
				; its element per 128-bit chunk, so it is presumably not equivalent to this
				; whole-vector splat for longer vectors - confirm.
				define <vscale x 4 x float> @sve_fma_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
				; CHECK-LABEL: sve_fma_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmla z1.s, z0.s, z2.s[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
				%mad = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> %b0splat, <vscale x 4 x float> %a, <vscale x 4 x float> %c)
				ret <vscale x 4 x float> %mad
				}

				; llvm.fma on <vscale x 2 x double>: lane 0 of %b is broadcast (shufflevector
				; with a zeroinitializer mask) and used as the first multiplicand, %a is the
				; second multiplicand and %c the addend. The autogenerated assertions expect
				; a single indexed fmla followed by a register move.
				; NOTE(review): the review comment on this revision says indexed FMLA picks
				; its element per 128-bit chunk, so it is presumably not equivalent to this
				; whole-vector splat for longer vectors - confirm.
				define <vscale x 2 x double> @sve_fma_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
				; CHECK-LABEL: sve_fma_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmla z1.d, z0.d, z2.d[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 2 x double> %b, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
				%mad = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> %b0splat, <vscale x 2 x double> %a, <vscale x 2 x double> %c)
				ret <vscale x 2 x double> %mad
				}

				; llvm.fmuladd on <vscale x 8 x half>: lane 0 of %b is broadcast
				; (shufflevector with a zeroinitializer mask) and used as the first
				; multiplicand, %a is the second multiplicand and %c the addend. The
				; autogenerated assertions expect a single indexed fmla and a register move.
				; NOTE(review): the review comment on this revision says indexed FMLA picks
				; its element per 128-bit chunk, so it is presumably not equivalent to this
				; whole-vector splat for longer vectors - confirm.
				define <vscale x 8 x half> @sve_fmuladd_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
				; CHECK-LABEL: sve_fmuladd_nxv8f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmla z1.h, z0.h, z2.h[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 8 x half> %b, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
				%mad = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> %b0splat, <vscale x 8 x half> %a, <vscale x 8 x half> %c)
				ret <vscale x 8 x half> %mad
				}

				; llvm.fmuladd on <vscale x 4 x float>: lane 0 of %b is broadcast
				; (shufflevector with a zeroinitializer mask) and used as the first
				; multiplicand, %a is the second multiplicand and %c the addend. The
				; autogenerated assertions expect a single indexed fmla and a register move.
				; NOTE(review): the review comment on this revision says indexed FMLA picks
				; its element per 128-bit chunk, so it is presumably not equivalent to this
				; whole-vector splat for longer vectors - confirm.
				define <vscale x 4 x float> @sve_fmuladd_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
				; CHECK-LABEL: sve_fmuladd_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmla z1.s, z0.s, z2.s[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
				%mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %b0splat, <vscale x 4 x float> %a, <vscale x 4 x float> %c)
				ret <vscale x 4 x float> %mad
				}

				; llvm.fmuladd on <vscale x 2 x double>: lane 0 of %b is broadcast
				; (shufflevector with a zeroinitializer mask) and used as the first
				; multiplicand, %a is the second multiplicand and %c the addend. The
				; autogenerated assertions expect a single indexed fmla and a register move.
				; NOTE(review): the review comment on this revision says indexed FMLA picks
				; its element per 128-bit chunk, so it is presumably not equivalent to this
				; whole-vector splat for longer vectors - confirm.
				define <vscale x 2 x double> @sve_fmuladd_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
				; CHECK-LABEL: sve_fmuladd_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmla z1.d, z0.d, z2.d[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 2 x double> %b, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
				%mad = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> %b0splat, <vscale x 2 x double> %a, <vscale x 2 x double> %c)
				ret <vscale x 2 x double> %mad
				}

				; Negated-multiplicand case on <vscale x 8 x half>: lane 0 of %b is broadcast
				; (zeroinitializer shuffle mask), negated with fneg, and passed as the FIRST
				; operand of llvm.fmuladd, with %a as the second multiplicand and %c as the
				; addend. The autogenerated assertions expect a single indexed fmls and a
				; register move.
				; NOTE(review): the review comment on this revision says the indexed
				; instructions pick their element per 128-bit chunk, so this is presumably
				; not equivalent to a whole-vector splat for longer vectors - confirm.
				define <vscale x 8 x half> @sve_fmls_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
				; CHECK-LABEL: sve_fmls_nxv8f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmls z1.h, z0.h, z2.h[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 8 x half> %b, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
				%b0splat_neg = fneg <vscale x 8 x half> %b0splat
				%mad = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> %b0splat_neg, <vscale x 8 x half> %a, <vscale x 8 x half> %c)
				ret <vscale x 8 x half> %mad
				}

				; Negated-multiplicand case on <vscale x 4 x float>: lane 0 of %b is broadcast
				; (zeroinitializer shuffle mask), negated with fneg, and passed as the FIRST
				; operand of llvm.fmuladd, with %a as the second multiplicand and %c as the
				; addend. The autogenerated assertions expect a single indexed fmls and a
				; register move.
				; NOTE(review): the review comment on this revision says the indexed
				; instructions pick their element per 128-bit chunk, so this is presumably
				; not equivalent to a whole-vector splat for longer vectors - confirm.
				define <vscale x 4 x float> @sve_fmls_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
				; CHECK-LABEL: sve_fmls_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmls z1.s, z0.s, z2.s[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
				%b0splat_neg = fneg <vscale x 4 x float> %b0splat
				%mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %b0splat_neg, <vscale x 4 x float> %a, <vscale x 4 x float> %c)
				ret <vscale x 4 x float> %mad
				}

				; Negated-multiplicand case on <vscale x 2 x double>: lane 0 of %b is
				; broadcast (zeroinitializer shuffle mask), negated with fneg, and passed as
				; the FIRST operand of llvm.fmuladd, with %a as the second multiplicand and
				; %c as the addend. The autogenerated assertions expect a single indexed fmls
				; and a register move.
				; NOTE(review): the review comment on this revision says the indexed
				; instructions pick their element per 128-bit chunk, so this is presumably
				; not equivalent to a whole-vector splat for longer vectors - confirm.
				define <vscale x 2 x double> @sve_fmls_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
				; CHECK-LABEL: sve_fmls_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmls z1.d, z0.d, z2.d[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 2 x double> %b, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
				%b0splat_neg = fneg <vscale x 2 x double> %b0splat
				%mad = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> %b0splat_neg, <vscale x 2 x double> %a, <vscale x 2 x double> %c)
				ret <vscale x 2 x double> %mad
				}

				; Commuted negated-multiplicand case on <vscale x 8 x half>: lane 0 of %b is
				; broadcast (zeroinitializer shuffle mask), negated with fneg, and passed as
				; the SECOND operand of llvm.fmuladd, with %a as the first multiplicand and
				; %c as the addend. The autogenerated assertions expect a single indexed fmls
				; and a register move.
				; NOTE(review): the review comment on this revision says the indexed
				; instructions pick their element per 128-bit chunk, so this is presumably
				; not equivalent to a whole-vector splat for longer vectors - confirm.
				define <vscale x 8 x half> @sve_fmls_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b, <vscale x 8 x half> %c) {
				; CHECK-LABEL: sve_fmls_nxv8f16_1:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmls z1.h, z0.h, z2.h[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 8 x half> %b, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
				%b0splat_neg = fneg <vscale x 8 x half> %b0splat
				%mad = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b0splat_neg, <vscale x 8 x half> %c)
				ret <vscale x 8 x half> %mad
				}

				; Commuted negated-multiplicand case on <vscale x 4 x float>: lane 0 of %b is
				; broadcast (zeroinitializer shuffle mask), negated with fneg, and passed as
				; the SECOND operand of llvm.fmuladd, with %a as the first multiplicand and
				; %c as the addend. The autogenerated assertions expect a single indexed fmls
				; and a register move.
				; NOTE(review): the review comment on this revision says the indexed
				; instructions pick their element per 128-bit chunk, so this is presumably
				; not equivalent to a whole-vector splat for longer vectors - confirm.
				define <vscale x 4 x float> @sve_fmls_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c) {
				; CHECK-LABEL: sve_fmls_nxv4f32_1:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmls z1.s, z0.s, z2.s[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
				%b0splat_neg = fneg <vscale x 4 x float> %b0splat
				%mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b0splat_neg, <vscale x 4 x float> %c)
				ret <vscale x 4 x float> %mad
				}

				; Commuted negated-multiplicand case on <vscale x 2 x double>: lane 0 of %b
				; is broadcast (zeroinitializer shuffle mask), negated with fneg, and passed
				; as the SECOND operand of llvm.fmuladd, with %a as the first multiplicand
				; and %c as the addend. The autogenerated assertions expect a single indexed
				; fmls and a register move.
				; NOTE(review): the review comment on this revision says the indexed
				; instructions pick their element per 128-bit chunk, so this is presumably
				; not equivalent to a whole-vector splat for longer vectors - confirm.
				define <vscale x 2 x double> @sve_fmls_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x double> %c) {
				; CHECK-LABEL: sve_fmls_nxv2f64_1:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmls z1.d, z0.d, z2.d[0]
				; CHECK-NEXT: mov z0.d, z1.d
				; CHECK-NEXT: ret
				%b0splat = shufflevector <vscale x 2 x double> %b, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
				%b0splat_neg = fneg <vscale x 2 x double> %b0splat
				%mad = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b0splat_neg, <vscale x 2 x double> %c)
				ret <vscale x 2 x double> %mad
				}

				declare <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
				declare <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

				declare <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
				declare <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
				declare <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][SVE] Add patterns to support sve indexed FMLA/FMLSNeeds ReviewPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 537560

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

llvm/lib/Target/AArch64/SVEInstrFormats.td

llvm/test/CodeGen/AArch64/sve-fma.ll

[AArch64][SVE] Add patterns to support sve indexed FMLA/FMLS
Needs ReviewPublic