This is an archive of the discontinued LLVM Phabricator instance.

[AArch64] Fold an sqadd of a sqdmull at lane 0 into an sqdmlal
AbandonedPublic

Authored by samtebbs on Aug 31 2021, 8:36 AM.

Download Raw Diff

Details

Reviewers

dmgreen
SjoerdMeijer
NickGuy

Summary

This patch folds an sqadd (i32, vector_extract (sqdmull (v4i16, v4i16), 0)) into an sqdmlal. We already generate an sqdmlal for lanes greater than 0, but the existing pattern doesn't match when the lane is 0, so this patch adds a pattern that emits an sqdmlal of the same format for lane 0.

Diff Detail

Unit Tests: Failed

	Time	Test
	2,880 ms	x64 debian > AddressSanitizer-x86_64-linux.TestCases::strcmp.c
	570 ms	x64 debian > LLVM.CodeGen/AArch64::arm64-neon-2velem.ll
	920 ms	x64 windows > LLVM.CodeGen/AArch64::arm64-neon-2velem.ll

Event Timeline

samtebbs created this revision. · Aug 31 2021, 8:36 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript · Aug 31 2021, 8:36 AM

samtebbs requested review of this revision. · Aug 31 2021, 8:36 AM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 31 2021, 8:36 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B121943: Diff 369711. · Aug 31 2021, 9:19 AM

dmgreen added inline comments. · Sep 1 2021, 3:11 PM

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
1205	There should probably be other types to this? Not just v4i16. And it doesn't look like it should be in the SVE file. From looking around, it appears most of the other patterns are in SIMDIndexedLongSQDMLXSDTied. But if this is from a scalar intrinsic, should it be producing a scalar sqdmlal?

samtebbs abandoned this revision. · May 25 2022, 7:56 AM

Herald added a project: Restricted Project. · View Herald Transcript · May 25 2022, 7:56 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64SVEInstrInfo.td

11 lines

test/

CodeGen/

AArch64/

arm64-neon-2velem.ll

21 lines

Diff 369711

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Show First 20 Lines • Show All 1,195 Lines • ▼ Show 20 Lines	let Predicates = [HasSVE] in {
def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),		def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;		(ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),		def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;		(ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),		def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;		(ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),		def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
(ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;		(ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;

		def : Pat<(i32 (int_aarch64_neon_sqadd (i32 FPR32Op:$Rd),
		dmgreenUnsubmitted Not Done Reply Inline Actions There should probably be other types to this? Not just v4i16. And it doesn't look like it should be in the SVE file. From looking around, it appears most of the other patterns are in SIMDIndexedLongSQDMLXSDTied. But if this is from a scalar intrinsic, should it be producing a scalar sqdmlal? dmgreen: There should probably be other types to this? Not just v4i16. And it doesn't look like it…
		(i32 (vector_extract (v4i32 (int_aarch64_neon_sqdmull
		(v4i16 V64:$Rm),
		(v4i16 V64:$Rn))),
		(i64 0))))),
		(EXTRACT_SUBREG (SQDMLALv4i16_indexed
		(SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub),
		V64:$Rm, (INSERT_SUBREG
		(v8i16 (IMPLICIT_DEF)), V64:$Rn, dsub),
		(i64 0)), ssub)>;
} // End HasSVE		} // End HasSVE

let Predicates = [HasSVEorStreamingSVE] in {		let Predicates = [HasSVEorStreamingSVE] in {
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;		defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;

defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;		defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;		defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;		defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
▲ Show 20 Lines • Show All 1,773 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll

	Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
	declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)			declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)

	declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)			declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)

	declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)			declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)

	declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)			declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)

				declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) #1

	declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)			declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)

	declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)			declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

	declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)			declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)

	declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)			declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)

	▲ Show 20 Lines • Show All 3,167 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer			%shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
	%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)			%vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
	%vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)			%vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
	ret <4 x i32> %vqdmlal4.i			ret <4 x i32> %vqdmlal4.i
	}			}

				define i32 @test_vqdmlal_lane_s16_0_i32(i32 %a, i16 %b, <4 x i16> %c) {
				; CHECK-LABEL: test_vqdmlal_lane_s16_0_i32:
				; CHECK: // %bb.0: // %entry
				; CHECK-NEXT: fmov s1, w1
				; CHECK-NEXT: fmov s2, w0
				; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
				; CHECK-NEXT: sqdmlal v2.4s, v1.4h, v0.h[0]
				; CHECK-NEXT: fmov w0, s2
				; CHECK-NEXT: ret
				entry:
				%0 = insertelement <4 x i16> undef, i16 %b, i64 0
				%1 = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
				%vqdmlXl = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %0, <4 x i16> %1)
				%lane0 = extractelement <4 x i32> %vqdmlXl, i64 0
				%vqdmlXl1 = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %lane0)
				ret i32 %vqdmlXl1
				}


	define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {			define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
	; CHECK-LABEL: test_vqdmlal_lane_s32_0:			; CHECK-LABEL: test_vqdmlal_lane_s32_0:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2			; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
	; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0]			; CHECK-NEXT: sqdmlal v0.2d, v1.2s, v2.s[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer			%shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
	▲ Show 20 Lines • Show All 472 Lines • Show Last 20 Lines