This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64InstrAtomics.td
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
-
rcpc3-sve.ll
1
rcpc3.ll

Differential D153129

[AArch64][RCPC3] Instruction selection for LDAP1/STL1 instructions
ClosedPublic

Authored by pratlucas on Jun 16 2023, 5:53 AM.

Download Raw Diff

Details

Reviewers

tmatheson
vhscampos
dmgreen
LukeGeeson
efriedma

Commits

rG54c7aec449c3: [AArch64][RCPC3] Instruction selection for LDAP1/STL1 instructions

Summary

This implements the DAG patterns to enable instruction selection for the
LDAP1 and STL1 instructions from FEAT_LRCPC3. The instructions should
match the following combinations:

Acquiring atomic load + vector insert element for LDAP1.
Vector extract element + releasing atomic store for STL1.

Patterns have also been added to cope with the DAG structure found when
dealing with 1-lane sub-vectors.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

pratlucas created this revision.Jun 16 2023, 5:53 AM

Herald added a project: Restricted Project. · View Herald TranscriptJun 16 2023, 5:53 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

pratlucas requested review of this revision.Jun 16 2023, 5:53 AM

Herald added a project: Restricted Project. · View Herald TranscriptJun 16 2023, 5:53 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Harbormaster completed remote builds in B239391: Diff 532113.Jun 16 2023, 6:27 AM

pratlucas added a reviewer: LukeGeeson.Jun 19 2023, 3:17 AM

From my understanding of LDAP1 and STL1 these instructions (in particular LDAP) have the same ordering semantics as LDAPR [1] and hence I expect the correctness argument will be similar to the LDAPR testing [2].

I expect this patch is ok (as long as LDAP1 is not used for SC accesses or mixed) - but I'll need to implement some testing to be certain.

[1] https://developer.arm.com/documentation/ddi0602/2022-12/SIMD-FP-Instructions/LDAP1--SIMD-FP---Load-Acquire-RCpc-one-single-element-structure-to-one-lane-of-one-register-
[2] https://reviews.llvm.org/D126250

LGTM. As @LukeGeeson notes these are not for sequentially consistent ops so won't cause something like [1], and they are gated behind +rcpc3.

[1] https://github.com/llvm/llvm-project/issues/62652

This revision is now accepted and ready to land.Jun 20 2023, 3:21 AM

tmatheson added a reviewer: efriedma.Jun 20 2023, 3:21 AM

tmatheson added a parent revision: D153128: [AArch64][RCPC3] Add Neon intrinsics for LDAP1 and STL1.

I'd like to see test coverage for load/store of:

a plain non-vector double value
64-bit vectors
insert/extract from SVE vectors
inserting into lane zero of a <2 x i64> vector

(I'm not expecting any particular result for the above, but I'd like to see what happens beyond the exact patterns you've implemented.)

Matt added a subscriber: Matt.Jun 20 2023, 2:08 PM

Increasing test coverage, with contributions from @tmatheson.

Harbormaster completed remote builds in B241819: Diff 535432.Jun 28 2023, 10:13 AM

Ping.

LGTM

Not implementing optimized lowering for "double" and SVE seems fine for now, but you might want to consider it for the future.

llvm/test/CodeGen/AArch64/rcpc3.ll
248	The tests for a non-atomic load of a vector don't seem necessary. (I forgot you can't do an atomic load of a vector.) Maybe worth testing "load atomic i64" followed by a bitcast to <2 x i32>, instead.

Adding tests for atomic load/store of i64 + bitcast to/from <2 x i32>.

I'm keeping the non-atomic load/store tests to make sure we have minimal
coverage of those operations with rcpc3 enabled, as I couldn't find it
in any other test.

This revision was landed with ongoing or failed builds.Jul 7 2023, 4:33 AM

Closed by commit rG54c7aec449c3: [AArch64][RCPC3] Instruction selection for LDAP1/STL1 instructions (authored by pratlucas). · Explain Why

This revision was automatically updated to reflect the committed changes.

pratlucas added a commit: rG54c7aec449c3: [AArch64][RCPC3] Instruction selection for LDAP1/STL1 instructions.

Harbormaster completed remote builds in B243733: Diff 538075.Jul 7 2023, 6:05 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64InstrAtomics.td

31 lines

test/

CodeGen/

AArch64/

rcpc3-sve.ll

56 lines

rcpc3.ll

325 lines

Diff 538080

llvm/lib/Target/AArch64/AArch64InstrAtomics.td

Show First 20 Lines • Show All 536 Lines • ▼ Show 20 Lines	let Predicates = [HasLSE] in {
defm : CASregister_patterns<"CAS", "atomic_cmp_swap">;		defm : CASregister_patterns<"CAS", "atomic_cmp_swap">;

// These two patterns are only needed for global isel, selection dag isel		// These two patterns are only needed for global isel, selection dag isel
// converts atomic load-sub into a sub and atomic load-add, and likewise for		// converts atomic load-sub into a sub and atomic load-add, and likewise for
// and -> clr.		// and -> clr.
defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;		defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;		defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
}		}

		// v8.9a/v9.4a FEAT_LRCPC3 patterns
		let Predicates = [HasRCPC3, HasNEON] in {
		// LDAP1 loads
		def : Pat<(vector_insert (v2i64 VecListOne128:$Rd),
		(i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)), VectorIndexD:$idx),
		(LDAP1 VecListOne128:$Rd, VectorIndexD:$idx, GPR64sp:$Rn)>;
		def : Pat<(vector_insert (v2f64 VecListOne128:$Rd),
		(f64 (bitconvert (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))), VectorIndexD:$idx),
		(LDAP1 VecListOne128:$Rd, VectorIndexD:$idx, GPR64sp:$Rn)>;
		def : Pat<(v1i64 (scalar_to_vector
		(i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))),
		(EXTRACT_SUBREG (LDAP1 (v2i64 (IMPLICIT_DEF)), (i64 0), GPR64sp:$Rn), dsub)>;
		def : Pat<(v1f64 (scalar_to_vector
		(f64 (bitconvert (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))))),
		(EXTRACT_SUBREG (LDAP1 (v2f64 (IMPLICIT_DEF)), (i64 0), GPR64sp:$Rn), dsub)>;

		// STL1 stores
		def : Pat<(releasing_store<atomic_store_64> GPR64sp:$Rn,
		(i64 (vector_extract (v2i64 VecListOne128:$Vt), VectorIndexD:$idx))),
		(STL1 VecListOne128:$Vt, VectorIndexD:$idx, GPR64sp:$Rn)>;
		def : Pat<(releasing_store<atomic_store_64> GPR64sp:$Rn,
		(i64 (bitconvert (f64 (vector_extract (v2f64 VecListOne128:$Vt), VectorIndexD:$idx))))),
		(STL1 VecListOne128:$Vt, VectorIndexD:$idx, GPR64sp:$Rn)>;
		// The v1i64 version of the vstl1_lane_* intrinsic is represented as a
		// vector_insert -> vector_extract -> atomic store sequence, which is captured
		// by the patterns above. We only need to cover the v1f64 case manually.
		def : Pat<(releasing_store<atomic_store_64> GPR64sp:$Rn,
		(i64 (bitconvert (v1f64 VecListOne64:$Vt)))),
		(STL1 (SUBREG_TO_REG (i64 0), VecListOne64:$Vt, dsub), (i64 0), GPR64sp:$Rn)>;
		}

llvm/test/CodeGen/AArch64/rcpc3-sve.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
				; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+sve -mattr=+rcpc3 < %s \| FileCheck %s
				; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+sve < %s \| FileCheck %s

				; Show what happens with RCPC3 for extract/insert into SVE vectors.
				; Currently there is no RCPC3 codegen expected for this.

				define hidden <vscale x 2 x i64> @test_load_sve_lane0(ptr nocapture noundef readonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
				; CHECK-LABEL: test_load_sve_lane0:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ldapr x8, [x0]
				; CHECK-NEXT: ptrue p0.d, vl1
				; CHECK-NEXT: mov z0.d, p0/m, x8
				; CHECK-NEXT: ret
				%1 = load atomic i64, ptr %a acquire, align 8
				%vldap1_lane = insertelement <vscale x 2 x i64> %b, i64 %1, i64 0
				ret <vscale x 2 x i64> %vldap1_lane
				}

				define hidden <vscale x 2 x i64> @test_load_sve_lane1(ptr nocapture noundef readonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
				; CHECK-LABEL: test_load_sve_lane1:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov w8, #1 // =0x1
				; CHECK-NEXT: ldapr x9, [x0]
				; CHECK-NEXT: index z2.d, #0, #1
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: mov z1.d, x8
				; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z1.d
				; CHECK-NEXT: mov z0.d, p0/m, x9
				; CHECK-NEXT: ret
				%1 = load atomic i64, ptr %a acquire, align 8
				%vldap1_lane = insertelement <vscale x 2 x i64> %b, i64 %1, i64 1
				ret <vscale x 2 x i64> %vldap1_lane
				}

				define hidden void @test_store_sve_lane0(ptr nocapture noundef writeonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
				; CHECK-LABEL: test_store_sve_lane0:
				; CHECK: // %bb.0:
				; CHECK-NEXT: fmov x8, d0
				; CHECK-NEXT: stlr x8, [x0]
				; CHECK-NEXT: ret
				%1 = extractelement <vscale x 2 x i64> %b, i64 0
				store atomic i64 %1, ptr %a release, align 8
				ret void
				}

				define hidden void @test_store_sve_lane1(ptr nocapture noundef writeonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
				; CHECK-LABEL: test_store_sve_lane1:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov x8, v0.d[1]
				; CHECK-NEXT: stlr x8, [x0]
				; CHECK-NEXT: ret
				%1 = extractelement <vscale x 2 x i64> %b, i64 1
				store atomic i64 %1, ptr %a release, align 8
				ret void
				}

llvm/test/CodeGen/AArch64/rcpc3.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
				; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+rcpc3 < %s \| FileCheck --check-prefixes=BOTH,RCPC3 %s
				; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a < %s \| FileCheck --check-prefixes=BOTH,NO-RCPC3 %s

				define hidden <2 x i64> @test_ldap1_2xi64_lane0(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_ldap1_2xi64_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: ldap1 { v0.d }[0], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_ldap1_2xi64_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: ldapr x8, [x0]
				; NO-RCPC3-NEXT: mov v0.d[0], x8
				; NO-RCPC3-NEXT: ret
				%1 = load atomic i64, ptr %a acquire, align 8
				%ldap1 = insertelement <2 x i64> %b, i64 %1, i64 0
				ret <2 x i64> %ldap1
				}

				define hidden <2 x i64> @test_ldap1_2xi64_lane1(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_ldap1_2xi64_lane1:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: ldap1 { v0.d }[1], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_ldap1_2xi64_lane1:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: ldapr x8, [x0]
				; NO-RCPC3-NEXT: mov v0.d[1], x8
				; NO-RCPC3-NEXT: ret
				%1 = load atomic i64, ptr %a acquire, align 8
				%ldap1 = insertelement <2 x i64> %b, i64 %1, i64 1
				ret <2 x i64> %ldap1
				}

				define hidden nofpclass(nan inf) <2 x double> @test_ldap1_2xdouble_lane0(ptr nocapture noundef readonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_ldap1_2xdouble_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: ldap1 { v0.d }[0], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_ldap1_2xdouble_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: ldapr x8, [x0]
				; NO-RCPC3-NEXT: fmov d1, x8
				; NO-RCPC3-NEXT: mov v0.d[0], v1.d[0]
				; NO-RCPC3-NEXT: ret
				%1 = load atomic double, ptr %a acquire, align 8
				%ldap1 = insertelement <2 x double> %b, double %1, i64 0
				ret <2 x double> %ldap1
				}

				define hidden nofpclass(nan inf) <2 x double> @test_ldap1_2xdouble_lane1(ptr nocapture noundef readonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_ldap1_2xdouble_lane1:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: ldap1 { v0.d }[1], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_ldap1_2xdouble_lane1:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: ldapr x8, [x0]
				; NO-RCPC3-NEXT: fmov d1, x8
				; NO-RCPC3-NEXT: mov v0.d[1], v1.d[0]
				; NO-RCPC3-NEXT: ret
				%1 = load atomic double, ptr %a acquire, align 8
				%ldap1 = insertelement <2 x double> %b, double %1, i64 1
				ret <2 x double> %ldap1
				}

				define hidden <1 x i64> @test_ldap1_1xi64_lane0(ptr nocapture noundef readonly %a, <1 x i64> noundef %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_ldap1_1xi64_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: ldap1 { v0.d }[0], [x0]
				; RCPC3-NEXT: // kill: def $d0 killed $d0 killed $q0
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_ldap1_1xi64_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: ldapr x8, [x0]
				; NO-RCPC3-NEXT: fmov d0, x8
				; NO-RCPC3-NEXT: ret
				%1 = load atomic i64, ptr %a acquire, align 8
				%ldap1 = insertelement <1 x i64> poison, i64 %1, i64 0
				ret <1 x i64> %ldap1
				}

				define hidden nofpclass(nan inf) <1 x double> @test_ldap1_1xdouble_lane0(ptr nocapture noundef readonly %a, <1 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_ldap1_1xdouble_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: ldap1 { v0.d }[0], [x0]
				; RCPC3-NEXT: // kill: def $d0 killed $d0 killed $q0
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_ldap1_1xdouble_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: ldapr x8, [x0]
				; NO-RCPC3-NEXT: fmov d0, x8
				; NO-RCPC3-NEXT: ret
				%1 = load atomic double, ptr %a acquire, align 8
				%ldap1 = insertelement <1 x double> poison, double %1, i64 0
				ret <1 x double> %ldap1
				}

				define hidden void @test_stl1_2xi64_lane0(ptr nocapture noundef writeonly %a, <2 x i64> noundef %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_stl1_2xi64_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: stl1 { v0.d }[0], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_stl1_2xi64_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: fmov x8, d0
				; NO-RCPC3-NEXT: stlr x8, [x0]
				; NO-RCPC3-NEXT: ret
				%1 = extractelement <2 x i64> %b, i64 0
				store atomic i64 %1, ptr %a release, align 8
				ret void
				}

				define hidden void @test_stl1_2xi64_lane1(ptr nocapture noundef writeonly %a, <2 x i64> noundef %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_stl1_2xi64_lane1:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: stl1 { v0.d }[1], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_stl1_2xi64_lane1:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: mov x8, v0.d[1]
				; NO-RCPC3-NEXT: stlr x8, [x0]
				; NO-RCPC3-NEXT: ret
				%1 = extractelement <2 x i64> %b, i64 1
				store atomic i64 %1, ptr %a release, align 8
				ret void
				}

				define hidden void @test_stl1_2xdouble_lane0(ptr nocapture noundef writeonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_stl1_2xdouble_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: stl1 { v0.d }[0], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_stl1_2xdouble_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: fmov x8, d0
				; NO-RCPC3-NEXT: stlr x8, [x0]
				; NO-RCPC3-NEXT: ret
				%1 = extractelement <2 x double> %b, i64 0
				store atomic double %1, ptr %a release, align 8
				ret void
				}

				define hidden void @test_stl1_2xdouble_lane1(ptr nocapture noundef writeonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_stl1_2xdouble_lane1:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: stl1 { v0.d }[1], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_stl1_2xdouble_lane1:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: mov d0, v0.d[1]
				; NO-RCPC3-NEXT: fmov x8, d0
				; NO-RCPC3-NEXT: stlr x8, [x0]
				; NO-RCPC3-NEXT: ret
				%1 = extractelement <2 x double> %b, i64 1
				store atomic double %1, ptr %a release, align 8
				ret void
				}

				define hidden void @test_stl1_1xi64_lane0(ptr nocapture noundef writeonly %a, <1 x i64> noundef %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_stl1_1xi64_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: // kill: def $d0 killed $d0 def $q0
				; RCPC3-NEXT: stl1 { v0.d }[0], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_stl1_1xi64_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: // kill: def $d0 killed $d0 def $q0
				; NO-RCPC3-NEXT: fmov x8, d0
				; NO-RCPC3-NEXT: stlr x8, [x0]
				; NO-RCPC3-NEXT: ret
				%1 = extractelement <1 x i64> %b, i64 0
				store atomic i64 %1, ptr %a release, align 8
				ret void
				}

				define hidden void @test_stl1_1xdouble_lane0(ptr nocapture noundef writeonly %a, <1 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
				;
				; RCPC3-LABEL: test_stl1_1xdouble_lane0:
				; RCPC3: // %bb.0:
				; RCPC3-NEXT: // kill: def $d0 killed $d0 def $q0
				; RCPC3-NEXT: stl1 { v0.d }[0], [x0]
				; RCPC3-NEXT: ret
				;
				; NO-RCPC3-LABEL: test_stl1_1xdouble_lane0:
				; NO-RCPC3: // %bb.0:
				; NO-RCPC3-NEXT: fmov x8, d0
				; NO-RCPC3-NEXT: stlr x8, [x0]
				; NO-RCPC3-NEXT: ret
				%1 = extractelement <1 x double> %b, i64 0
				store atomic double %1, ptr %a release, align 8
				ret void
				}

				; The remaining tests do not have any particular RCPC3-specific codegen:

				; load-acquire a plain non-vector double value
				define hidden double @test_double_load(ptr nocapture noundef readonly %a) local_unnamed_addr {
				; BOTH-LABEL: test_double_load:
				; BOTH: // %bb.0:
				; BOTH-NEXT: ldapr x8, [x0]
				; BOTH-NEXT: fmov d0, x8
				; BOTH-NEXT: ret
				%1 = load atomic double, ptr %a acquire, align 8
				ret double %1
				}

				; store-release a plain non-vector double value
				define hidden void @test_double_store(ptr nocapture noundef writeonly %a, double noundef %b) local_unnamed_addr {
				; BOTH-LABEL: test_double_store:
				; BOTH: // %bb.0:
				; BOTH-NEXT: fmov x8, d0
				; BOTH-NEXT: stlr x8, [x0]
				; BOTH-NEXT: ret
				store atomic double %b, ptr %a release, align 8
				ret void
				}

				; load-acquire an i64, followed by a bitcast to a 64-bit vector
				define hidden <2 x i32> @test_load_i64_bitcast_2xi32(ptr nocapture noundef readonly %a) local_unnamed_addr {
				; BOTH-LABEL: test_load_i64_bitcast_2xi32:
				; BOTH: // %bb.0:
				; BOTH-NEXT: ldapr x8, [x0]
				; BOTH-NEXT: fmov d0, x8
				; BOTH-NEXT: ret
				%1 = load atomic i64, ptr %a acquire, align 8
				[Inline comment, efriedma (Unsubmitted, Not Done)]: The tests for a non-atomic load of a vector don't seem necessary. (I forgot you can't do an atomic load of a vector.) Maybe worth testing "load atomic i64" followed by a bitcast to <2 x i32>, instead.
				%2 = bitcast i64 %1 to <2 x i32>
				ret <2 x i32> %2
				}

				; bitcast from a 64-bit vector, followed by a store-release of the i64
				define hidden void @test_bitcast_2xi32_store_i64(ptr nocapture noundef readonly %a, <2 x i32> noundef %b) local_unnamed_addr {
				; BOTH-LABEL: test_bitcast_2xi32_store_i64:
				; BOTH: // %bb.0:
				; BOTH-NEXT: fmov x8, d0
				; BOTH-NEXT: stlr x8, [x0]
				; BOTH-NEXT: ret
				%1 = bitcast <2 x i32> %b to i64
				store atomic i64 %1, ptr %a release, align 8
				ret void
				}

				; (non-atomic) load a 64-bit vector
				define hidden <2 x i32> @test_load_2xi32(ptr nocapture noundef readonly %a) local_unnamed_addr {
				; BOTH-LABEL: test_load_2xi32:
				; BOTH: // %bb.0:
				; BOTH-NEXT: ldr d0, [x0]
				; BOTH-NEXT: ret
				%1 = load <2 x i32>, ptr %a, align 8
				ret <2 x i32> %1
				}

				; (non-atomic) store a 64-bit vector
				define hidden void @test_store_2xi32(ptr nocapture noundef writeonly %a, <2 x i32> noundef %b) local_unnamed_addr {
				; BOTH-LABEL: test_store_2xi32:
				; BOTH: // %bb.0:
				; BOTH-NEXT: str d0, [x0]
				; BOTH-NEXT: ret
				store <2 x i32> %b, ptr %a, align 8
				ret void
				}

				; (non-atomic) load a 64-bit vector
				define hidden <1 x i64> @test_load_1xi64(ptr nocapture noundef readonly %a) local_unnamed_addr {
				; BOTH-LABEL: test_load_1xi64:
				; BOTH: // %bb.0:
				; BOTH-NEXT: ldr d0, [x0]
				; BOTH-NEXT: ret
				%1 = load <1 x i64>, ptr %a, align 8
				ret <1 x i64> %1
				}

				; (non-atomic) store a 64-bit vector
				define hidden void @test_store_1xi64(ptr nocapture noundef writeonly %a, <1 x i64> noundef %b) local_unnamed_addr {
				; BOTH-LABEL: test_store_1xi64:
				; BOTH: // %bb.0:
				; BOTH-NEXT: str d0, [x0]
				; BOTH-NEXT: ret
				store <1 x i64> %b, ptr %a, align 8
				ret void
				}

				; (non-atomic) load a 64-bit value and insert into vector
				define hidden <2 x i64> @test_load_insert_2xi64(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
				; BOTH-LABEL: test_load_insert_2xi64:
				; BOTH: // %bb.0:
				; BOTH-NEXT: ld1 { v0.d }[0], [x0]
				; BOTH-NEXT: ret
				%1 = load i64, ptr %a, align 8
				%2 = insertelement <2 x i64> %b, i64 %1, i64 0
				ret <2 x i64> %2
				}

				; extract from vector and (non-atomic) store a 64-bit value
				define hidden void @test_extract_store_2xi64(ptr nocapture noundef writeonly %a, <2 x i64> noundef %b) local_unnamed_addr {
				; BOTH-LABEL: test_extract_store_2xi64:
				; BOTH: // %bb.0:
				; BOTH-NEXT: st1 { v0.d }[1], [x0]
				; BOTH-NEXT: ret
				%1 = extractelement <2 x i64> %b, i64 1
				store i64 %1, ptr %a, align 8
				ret void
				}