This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64InstrInfo.td
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
neon-insextbitcast.ll

Differential D134022

[AArch64] Insert/Extract of bitcast patterns
ClosedPublic

Authored by dmgreen on Sep 16 2022, 1:07 AM.

Download Raw Diff

Details

Reviewers

labrinea
samtebbs
sdesmalen
stuij
jaykang10

Commits

rG9a20596f4825: [AArch64] Insert/Extract of bitcast patterns

Summary

This adds some quick tablegen patterns for vector_insert(bitcast(..)) and bitcast(vector_extract(..)), allowing us to avoid a round-trip through GPRs.

Diff Detail

Unit Tests: Failed

	Time	Test
	60,040 ms	x64 debian > libFuzzer.libFuzzer::fuzzer-leak.test

Event Timeline

dmgreen created this revision. · Sep 16 2022, 1:07 AM

Herald added a project: Restricted Project. · View Herald Transcript · Sep 16 2022, 1:07 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

dmgreen requested review of this revision. · Sep 16 2022, 1:07 AM

Herald added a project: Restricted Project. · View Herald Transcript · Sep 16 2022, 1:07 AM

Harbormaster completed remote builds in B187082: Diff 460664. · Sep 16 2022, 1:44 AM

Nice change

This revision is now accepted and ready to land. · Sep 20 2022, 6:14 AM

This revision was landed with ongoing or failed builds. · Sep 21 2022, 1:54 AM

Closed by commit rG9a20596f4825: [AArch64] Insert/Extract of bitcast patterns (authored by dmgreen). · Explain Why

This revision was automatically updated to reflect the committed changes.

dmgreen added a commit: rG9a20596f4825: [AArch64] Insert/Extract of bitcast patterns.

Looks like some of our code gets miscompiled on aarch64 after this change. I can't yet share an isolated test, but this is the difference in the assembly:

@@ -109,11 +109,10 @@
        ldr     x10, [sp, #48]
        stp     x8, x9, [sp, #64]
        blr     x10
-       scvtf   d0, w0
        mov     w8, #1
-       fmov    x9, d0
+       scvtf   d1, w0
        dup     v0.2d, x8
-       mov     v0.d[0], x9
+       fmov    d0, d1
 .LBB1_5:
        adrp    x1, .L.str.3
        add     x1, x1, :lo12:.L.str.3

The code misbehaves only on arm with -O2 and higher (works fine on arm with -O1 and on x86-64 with any optimization level) and I don't see any undefined behavior in it.

Yeah that certainly looks wrong. The fmov will be clearing the result of the dup. I suspect that INSERT_SUBREG without an IMPLICIT_DEF operand is what is going wrong.

I'll revert the patch.

dmgreen mentioned this in rG401481daac90: [AArch64] Remove incorrect zero element insert-bitcast patterns. · Sep 27 2022, 9:08 AM

I've only removed the zero-element patterns in rG401481daac90. I believe those were the ones going wrong. Did that fix the issues that you are seeing? If not we can revert the rest whilst we investigate.

In D134022#3818424, @dmgreen wrote:

I've only removed the zero-element patterns in rG401481daac90. I believe those were the ones going wrong. Did that fix the issues that you are seeing? If not we can revert the rest whilst we investigate.

Thank you Dave! rG401481daac90 fixes this particular problem. I'm now testing it completely to ensure it doesn't break anything else.

Thanks for the confirmation. Let us know.

In D134022#3822808, @dmgreen wrote:

Thanks for the confirmation. Let us know.

No new issues discovered after this patch. Thanks again for the quick resolution!

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64InstrInfo.td

21 lines

test/

CodeGen/

AArch64/

neon-insextbitcast.ll

33 lines

Diff 460664

llvm/lib/Target/AArch64/AArch64InstrInfo.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,878 Lines • ▼ Show 20 Lines	def : Pat<(VT64 (vector_insert V64:$src,
dsub)>;		dsub)>;
}		}

defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;		defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;		defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;		defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;		defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;

		// Insert from bitcast
		// vector_insert(bitcast(f32 src), n, lane) -> INSvi32lane(src, lane, INSERT_SUBREG(-, n), 0)
		def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), imm:$Immd)),
		(INSvi32lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$Sn, ssub), 0)>;
		def : Pat<(v4i32 (vector_insert v4i32:$src, (i32 (bitconvert (f32 FPR32:$Sn))), 0)),
		(INSERT_SUBREG v4i32:$src, FPR32:$Sn, ssub)>;
		def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), imm:$Immd)),
		(INSvi64lane V128:$src, imm:$Immd, (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$Sn, dsub), 0)>;
		def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), 0)),
		(INSERT_SUBREG v2i64:$src, FPR64:$Sn, dsub)>;

		// bitcast of an extract
		// f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane))
		def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))),
		(EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>;
		def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, 0)))),
		(EXTRACT_SUBREG V128:$src, ssub)>;
		def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))),
		(EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>;
		def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, 0)))),
		(EXTRACT_SUBREG V128:$src, dsub)>;

// Floating point vector extractions are codegen'd as either a sequence of		// Floating point vector extractions are codegen'd as either a sequence of
// subregister extractions, or a MOV (aka DUP here) if		// subregister extractions, or a MOV (aka DUP here) if
// the lane number is anything other than zero.		// the lane number is anything other than zero.
def : Pat<(vector_extract (v2f64 V128:$Rn), 0),		def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;		(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),		def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;		(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
▲ Show 20 Lines • Show All 2,526 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/neon-insextbitcast.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon \| FileCheck %s			; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon \| FileCheck %s

	define <4 x i32> @test_vins_v4i32(<4 x i32> %a, float %b) {			define <4 x i32> @test_vins_v4i32(<4 x i32> %a, float %b) {
	; CHECK-LABEL: test_vins_v4i32:			; CHECK-LABEL: test_vins_v4i32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov w8, s1			; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
	; CHECK-NEXT: mov v0.s[3], w8			; CHECK-NEXT: mov v0.s[3], v1.s[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast float %b to i32			%c = bitcast float %b to i32
	%d = insertelement <4 x i32> %a, i32 %c, i32 3			%d = insertelement <4 x i32> %a, i32 %c, i32 3
	ret <4 x i32> %d			ret <4 x i32> %d
	}			}

	define <4 x i32> @test_vins_v4i32_0(<4 x i32> %a, float %b) {			define <4 x i32> @test_vins_v4i32_0(<4 x i32> %a, float %b) {
	; CHECK-LABEL: test_vins_v4i32_0:			; CHECK-LABEL: test_vins_v4i32_0:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov w8, s1			; CHECK-NEXT: fmov s0, s1
	; CHECK-NEXT: mov v0.s[0], w8
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast float %b to i32			%c = bitcast float %b to i32
	%d = insertelement <4 x i32> %a, i32 %c, i32 0			%d = insertelement <4 x i32> %a, i32 %c, i32 0
	ret <4 x i32> %d			ret <4 x i32> %d
	}			}

	define <2 x i32> @test_vins_v2i32(<2 x i32> %a, float %b) {			define <2 x i32> @test_vins_v2i32(<2 x i32> %a, float %b) {
	; CHECK-LABEL: test_vins_v2i32:			; CHECK-LABEL: test_vins_v2i32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov w8, s1
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0			; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: mov v0.s[1], w8			; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
				; CHECK-NEXT: mov v0.s[1], v1.s[0]
	; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0			; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast float %b to i32			%c = bitcast float %b to i32
	%d = insertelement <2 x i32> %a, i32 %c, i32 1			%d = insertelement <2 x i32> %a, i32 %c, i32 1
	ret <2 x i32> %d			ret <2 x i32> %d
	}			}

	define <2 x i32> @test_vins_v2i32_0(<2 x i32> %a, float %b) {			define <2 x i32> @test_vins_v2i32_0(<2 x i32> %a, float %b) {
	; CHECK-LABEL: test_vins_v2i32_0:			; CHECK-LABEL: test_vins_v2i32_0:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov w8, s1
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0			; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: mov v0.s[0], w8			; CHECK-NEXT: fmov s0, s1
	; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0			; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast float %b to i32			%c = bitcast float %b to i32
	%d = insertelement <2 x i32> %a, i32 %c, i32 0			%d = insertelement <2 x i32> %a, i32 %c, i32 0
	ret <2 x i32> %d			ret <2 x i32> %d
	}			}

	define <2 x i64> @test_vins_v2i64(<2 x i64> %a, double %b) {			define <2 x i64> @test_vins_v2i64(<2 x i64> %a, double %b) {
	; CHECK-LABEL: test_vins_v2i64:			; CHECK-LABEL: test_vins_v2i64:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov x8, d1			; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
	; CHECK-NEXT: mov v0.d[1], x8			; CHECK-NEXT: mov v0.d[1], v1.d[0]
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast double %b to i64			%c = bitcast double %b to i64
	%d = insertelement <2 x i64> %a, i64 %c, i32 1			%d = insertelement <2 x i64> %a, i64 %c, i32 1
	ret <2 x i64> %d			ret <2 x i64> %d
	}			}

	define <2 x i64> @test_vins_v2i64_0(<2 x i64> %a, double %b) {			define <2 x i64> @test_vins_v2i64_0(<2 x i64> %a, double %b) {
	; CHECK-LABEL: test_vins_v2i64_0:			; CHECK-LABEL: test_vins_v2i64_0:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov x8, d1			; CHECK-NEXT: fmov d0, d1
	; CHECK-NEXT: mov v0.d[0], x8
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast double %b to i64			%c = bitcast double %b to i64
	%d = insertelement <2 x i64> %a, i64 %c, i32 0			%d = insertelement <2 x i64> %a, i64 %c, i32 0
	ret <2 x i64> %d			ret <2 x i64> %d
	}			}

	define <1 x i64> @test_vins_v1i64(<1 x i64> %a, double %b) {			define <1 x i64> @test_vins_v1i64(<1 x i64> %a, double %b) {
	; CHECK-LABEL: test_vins_v1i64:			; CHECK-LABEL: test_vins_v1i64:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: fmov d0, d1			; CHECK-NEXT: fmov d0, d1
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%c = bitcast double %b to i64			%c = bitcast double %b to i64
	%d = insertelement <1 x i64> %a, i64 %c, i32 0			%d = insertelement <1 x i64> %a, i64 %c, i32 0
	ret <1 x i64> %d			ret <1 x i64> %d
	}			}


	define float @test_vext_v4i32(<4 x i32> %a) {			define float @test_vext_v4i32(<4 x i32> %a) {
	; CHECK-LABEL: test_vext_v4i32:			; CHECK-LABEL: test_vext_v4i32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov w8, v0.s[3]			; CHECK-NEXT: mov v0.s[0], v0.s[3]
	; CHECK-NEXT: fmov s0, w8			; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%b = extractelement <4 x i32> %a, i32 3			%b = extractelement <4 x i32> %a, i32 3
	%c = bitcast i32 %b to float			%c = bitcast i32 %b to float
	ret float %c			ret float %c
	}			}

	define float @test_vext_v4i32_0(<4 x i32> %a) {			define float @test_vext_v4i32_0(<4 x i32> %a) {
	; CHECK-LABEL: test_vext_v4i32_0:			; CHECK-LABEL: test_vext_v4i32_0:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0			; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%b = extractelement <4 x i32> %a, i32 0			%b = extractelement <4 x i32> %a, i32 0
	%c = bitcast i32 %b to float			%c = bitcast i32 %b to float
	ret float %c			ret float %c
	}			}

	define float @test_vext_v2i32(<2 x i32> %a) {			define float @test_vext_v2i32(<2 x i32> %a) {
	; CHECK-LABEL: test_vext_v2i32:			; CHECK-LABEL: test_vext_v2i32:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0			; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: mov w8, v0.s[1]			; CHECK-NEXT: mov v0.s[0], v0.s[1]
	; CHECK-NEXT: fmov s0, w8			; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%b = extractelement <2 x i32> %a, i32 1			%b = extractelement <2 x i32> %a, i32 1
	%c = bitcast i32 %b to float			%c = bitcast i32 %b to float
	ret float %c			ret float %c
	}			}

	define float @test_vext_v2i32_0(<2 x i32> %a) {			define float @test_vext_v2i32_0(<2 x i32> %a) {
	; CHECK-LABEL: test_vext_v2i32_0:			; CHECK-LABEL: test_vext_v2i32_0:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0			; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
	; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0			; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%b = extractelement <2 x i32> %a, i32 0			%b = extractelement <2 x i32> %a, i32 0
	%c = bitcast i32 %b to float			%c = bitcast i32 %b to float
	ret float %c			ret float %c
	}			}

	define double @test_vext_v2i64(<2 x i64> %a) {			define double @test_vext_v2i64(<2 x i64> %a) {
	; CHECK-LABEL: test_vext_v2i64:			; CHECK-LABEL: test_vext_v2i64:
	; CHECK: // %bb.0: // %entry			; CHECK: // %bb.0: // %entry
	; CHECK-NEXT: mov x8, v0.d[1]			; CHECK-NEXT: mov v0.d[0], v0.d[1]
	; CHECK-NEXT: fmov d0, x8			; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	entry:			entry:
	%b = extractelement <2 x i64> %a, i32 1			%b = extractelement <2 x i64> %a, i32 1
	%c = bitcast i64 %b to double			%c = bitcast i64 %b to double
	ret double %c			ret double %c
	}			}

	define double @test_vext_v2i64_0(<2 x i64> %a) {			define double @test_vext_v2i64_0(<2 x i64> %a) {
	Show All 21 Lines