diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -542,3 +542,34 @@
 defm : LDOPregister_patterns_mod<"LDADD", "atomic_load_sub", "SUB">;
 defm : LDOPregister_patterns_mod<"LDCLR", "atomic_load_and", "ORN">;
 }
+
+// v8.9a/v9.4a FEAT_LRCPC3 patterns
+let Predicates = [HasRCPC3, HasNEON] in {
+  // LDAP1 loads: fold a 64-bit load-acquire into the vector lane insert.
+  def : Pat<(vector_insert (v2i64 VecListOne128:$Rd),
+                (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)), VectorIndexD:$idx),
+            (LDAP1 VecListOne128:$Rd, VectorIndexD:$idx, GPR64sp:$Rn)>;
+  def : Pat<(vector_insert (v2f64 VecListOne128:$Rd),
+                (f64 (bitconvert (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))), VectorIndexD:$idx),
+            (LDAP1 VecListOne128:$Rd, VectorIndexD:$idx, GPR64sp:$Rn)>;
+  def : Pat<(v1i64 (scalar_to_vector
+                (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))),
+            (EXTRACT_SUBREG (LDAP1 (v2i64 (IMPLICIT_DEF)), (i64 0), GPR64sp:$Rn), dsub)>;
+  def : Pat<(v1f64 (scalar_to_vector
+                (f64 (bitconvert (i64 (acquiring_load<atomic_load_64> GPR64sp:$Rn)))))),
+            (EXTRACT_SUBREG (LDAP1 (v2f64 (IMPLICIT_DEF)), (i64 0), GPR64sp:$Rn), dsub)>;
+
+  // STL1 stores: fold a vector lane extract into the 64-bit store-release.
+  def : Pat<(releasing_store<atomic_store_64> GPR64sp:$Rn,
+                (i64 (vector_extract (v2i64 VecListOne128:$Vt), VectorIndexD:$idx))),
+            (STL1 VecListOne128:$Vt, VectorIndexD:$idx, GPR64sp:$Rn)>;
+  def : Pat<(releasing_store<atomic_store_64> GPR64sp:$Rn,
+                (i64 (bitconvert (f64 (vector_extract (v2f64 VecListOne128:$Vt), VectorIndexD:$idx))))),
+            (STL1 VecListOne128:$Vt, VectorIndexD:$idx, GPR64sp:$Rn)>;
+  // The v1i64 version of the vstl1_lane_* intrinsic is represented as a
+  // vector_insert -> vector_extract -> atomic store sequence, which is captured
+  // by the patterns above. We only need to cover the v1f64 case manually.
+  def : Pat<(releasing_store<atomic_store_64> GPR64sp:$Rn,
+                (i64 (bitconvert (v1f64 VecListOne64:$Vt)))),
+            (STL1 (SUBREG_TO_REG (i64 0), VecListOne64:$Vt, dsub), (i64 0), GPR64sp:$Rn)>;
+}
diff --git a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+sve -mattr=+rcpc3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+sve < %s | FileCheck %s
+
+; Show what happens with RCPC3 for extract/insert into SVE vectors.
+; Currently there is no RCPC3 codegen expected for this.
+
+define hidden <vscale x 2 x i64> @test_load_sve_lane0(ptr nocapture noundef readonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
+; CHECK-LABEL: test_load_sve_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldapr x8, [x0]
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov z0.d, p0/m, x8
+; CHECK-NEXT:    ret
+  %1 = load atomic i64, ptr %a acquire, align 8
+  %vldap1_lane = insertelement <vscale x 2 x i64> %b, i64 %1, i64 0
+  ret <vscale x 2 x i64> %vldap1_lane
+}
+
+define hidden <vscale x 2 x i64> @test_load_sve_lane1(ptr nocapture noundef readonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
+; CHECK-LABEL: test_load_sve_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    ldapr x9, [x0]
+; CHECK-NEXT:    index z2.d, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov z1.d, x8
+; CHECK-NEXT:    cmpeq p0.d, p0/z, z2.d, z1.d
+; CHECK-NEXT:    mov z0.d, p0/m, x9
+; CHECK-NEXT:    ret
+  %1 = load atomic i64, ptr %a acquire, align 8
+  %vldap1_lane = insertelement <vscale x 2 x i64> %b, i64 %1, i64 1
+  ret <vscale x 2 x i64> %vldap1_lane
+}
+
+define hidden void @test_store_sve_lane0(ptr nocapture noundef writeonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
+; CHECK-LABEL: test_store_sve_lane0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    stlr x8, [x0]
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x i64> %b, i64 0
+  store atomic i64 %1, ptr %a release, align 8
+  ret void
+}
+
+define hidden void @test_store_sve_lane1(ptr nocapture noundef writeonly %a, <vscale x 2 x i64> noundef %b) local_unnamed_addr {
+; CHECK-LABEL: test_store_sve_lane1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, v0.d[1]
+; CHECK-NEXT:    stlr x8, [x0]
+; CHECK-NEXT:    ret
+  %1 = extractelement <vscale x 2 x i64> %b, i64 1
+  store atomic i64 %1, ptr %a release, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/rcpc3.ll b/llvm/test/CodeGen/AArch64/rcpc3.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/rcpc3.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a -mattr=+rcpc3 < %s | FileCheck --check-prefixes=BOTH,RCPC3 %s
+; RUN: llc -mtriple=aarch64-none-eabi -mattr=+v8.9a < %s | FileCheck --check-prefixes=BOTH,NO-RCPC3 %s
+
+define hidden <2 x i64> @test_ldap1_2xi64_lane0(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_ldap1_2xi64_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    ldap1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_ldap1_2xi64_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    ldapr x8, [x0]
+; NO-RCPC3-NEXT:    mov v0.d[0], x8
+; NO-RCPC3-NEXT:    ret
+  %1 = load atomic i64, ptr %a acquire, align 8
+  %ldap1 = insertelement <2 x i64> %b, i64 %1, i64 0
+  ret <2 x i64> %ldap1
+}
+
+define hidden <2 x i64> @test_ldap1_2xi64_lane1(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_ldap1_2xi64_lane1:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    ldap1 { v0.d }[1], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_ldap1_2xi64_lane1:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    ldapr x8, [x0]
+; NO-RCPC3-NEXT:    mov v0.d[1], x8
+; NO-RCPC3-NEXT:    ret
+  %1 = load atomic i64, ptr %a acquire, align 8
+  %ldap1 = insertelement <2 x i64> %b, i64 %1, i64 1
+  ret <2 x i64> %ldap1
+}
+
+define hidden nofpclass(nan inf) <2 x double> @test_ldap1_2xdouble_lane0(ptr nocapture noundef readonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_ldap1_2xdouble_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    ldap1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_ldap1_2xdouble_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    ldapr x8, [x0]
+; NO-RCPC3-NEXT:    fmov d1, x8
+; NO-RCPC3-NEXT:    mov v0.d[0], v1.d[0]
+; NO-RCPC3-NEXT:    ret
+  %1 = load atomic double, ptr %a acquire, align 8
+  %ldap1 = insertelement <2 x double> %b, double %1, i64 0
+  ret <2 x double> %ldap1
+}
+
+define hidden nofpclass(nan inf) <2 x double> @test_ldap1_2xdouble_lane1(ptr nocapture noundef readonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_ldap1_2xdouble_lane1:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    ldap1 { v0.d }[1], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_ldap1_2xdouble_lane1:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    ldapr x8, [x0]
+; NO-RCPC3-NEXT:    fmov d1, x8
+; NO-RCPC3-NEXT:    mov v0.d[1], v1.d[0]
+; NO-RCPC3-NEXT:    ret
+  %1 = load atomic double, ptr %a acquire, align 8
+  %ldap1 = insertelement <2 x double> %b, double %1, i64 1
+  ret <2 x double> %ldap1
+}
+
+define hidden <1 x i64> @test_ldap1_1xi64_lane0(ptr nocapture noundef readonly %a, <1 x i64> noundef %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_ldap1_1xi64_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    ldap1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_ldap1_1xi64_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    ldapr x8, [x0]
+; NO-RCPC3-NEXT:    fmov d0, x8
+; NO-RCPC3-NEXT:    ret
+  %1 = load atomic i64, ptr %a acquire, align 8
+  %ldap1 = insertelement <1 x i64> poison, i64 %1, i64 0
+  ret <1 x i64> %ldap1
+}
+
+define hidden nofpclass(nan inf) <1 x double> @test_ldap1_1xdouble_lane0(ptr nocapture noundef readonly %a, <1 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_ldap1_1xdouble_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    ldap1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_ldap1_1xdouble_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    ldapr x8, [x0]
+; NO-RCPC3-NEXT:    fmov d0, x8
+; NO-RCPC3-NEXT:    ret
+  %1 = load atomic double, ptr %a acquire, align 8
+  %ldap1 = insertelement <1 x double> poison, double %1, i64 0
+  ret <1 x double> %ldap1
+}
+
+define hidden void @test_stl1_2xi64_lane0(ptr nocapture noundef writeonly %a, <2 x i64> noundef %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_stl1_2xi64_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    stl1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_stl1_2xi64_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    fmov x8, d0
+; NO-RCPC3-NEXT:    stlr x8, [x0]
+; NO-RCPC3-NEXT:    ret
+  %1 = extractelement <2 x i64> %b, i64 0
+  store atomic i64 %1, ptr %a release, align 8
+  ret void
+}
+
+define hidden void @test_stl1_2xi64_lane1(ptr nocapture noundef writeonly %a, <2 x i64> noundef %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_stl1_2xi64_lane1:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    stl1 { v0.d }[1], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_stl1_2xi64_lane1:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    mov x8, v0.d[1]
+; NO-RCPC3-NEXT:    stlr x8, [x0]
+; NO-RCPC3-NEXT:    ret
+  %1 = extractelement <2 x i64> %b, i64 1
+  store atomic i64 %1, ptr %a release, align 8
+  ret void
+}
+
+define hidden void @test_stl1_2xdouble_lane0(ptr nocapture noundef writeonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_stl1_2xdouble_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    stl1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_stl1_2xdouble_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    fmov x8, d0
+; NO-RCPC3-NEXT:    stlr x8, [x0]
+; NO-RCPC3-NEXT:    ret
+  %1 = extractelement <2 x double> %b, i64 0
+  store atomic double %1, ptr %a release, align 8
+  ret void
+}
+
+define hidden void @test_stl1_2xdouble_lane1(ptr nocapture noundef writeonly %a, <2 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_stl1_2xdouble_lane1:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    stl1 { v0.d }[1], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_stl1_2xdouble_lane1:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    mov d0, v0.d[1]
+; NO-RCPC3-NEXT:    fmov x8, d0
+; NO-RCPC3-NEXT:    stlr x8, [x0]
+; NO-RCPC3-NEXT:    ret
+  %1 = extractelement <2 x double> %b, i64 1
+  store atomic double %1, ptr %a release, align 8
+  ret void
+}
+
+define hidden void @test_stl1_1xi64_lane0(ptr nocapture noundef writeonly %a, <1 x i64> noundef %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_stl1_1xi64_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; RCPC3-NEXT:    stl1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_stl1_1xi64_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NO-RCPC3-NEXT:    fmov x8, d0
+; NO-RCPC3-NEXT:    stlr x8, [x0]
+; NO-RCPC3-NEXT:    ret
+  %1 = extractelement <1 x i64> %b, i64 0
+  store atomic i64 %1, ptr %a release, align 8
+  ret void
+}
+
+define hidden void @test_stl1_1xdouble_lane0(ptr nocapture noundef writeonly %a, <1 x double> noundef nofpclass(nan inf) %b) local_unnamed_addr {
+;
+; RCPC3-LABEL: test_stl1_1xdouble_lane0:
+; RCPC3:       // %bb.0:
+; RCPC3-NEXT:    // kill: def $d0 killed $d0 def $q0
+; RCPC3-NEXT:    stl1 { v0.d }[0], [x0]
+; RCPC3-NEXT:    ret
+;
+; NO-RCPC3-LABEL: test_stl1_1xdouble_lane0:
+; NO-RCPC3:       // %bb.0:
+; NO-RCPC3-NEXT:    fmov x8, d0
+; NO-RCPC3-NEXT:    stlr x8, [x0]
+; NO-RCPC3-NEXT:    ret
+  %1 = extractelement <1 x double> %b, i64 0
+  store atomic double %1, ptr %a release, align 8
+  ret void
+}
+
+; The remaining tests do not have any particular RCPC3-specific codegen:
+
+; load-acquire a plain non-vector double value
+define hidden double @test_double_load(ptr nocapture noundef readonly %a) local_unnamed_addr {
+; BOTH-LABEL: test_double_load:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    ldapr x8, [x0]
+; BOTH-NEXT:    fmov d0, x8
+; BOTH-NEXT:    ret
+  %1 = load atomic double, ptr %a acquire, align 8
+  ret double %1
+}
+
+; store-release a plain non-vector double value
+define hidden void @test_double_store(ptr nocapture noundef writeonly %a, double noundef %b) local_unnamed_addr {
+; BOTH-LABEL: test_double_store:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    fmov x8, d0
+; BOTH-NEXT:    stlr x8, [x0]
+; BOTH-NEXT:    ret
+  store atomic double %b, ptr %a release, align 8
+  ret void
+}
+
+; load-acquire an i64, followed by a bitcast to a 64-bit vector
+define hidden <2 x i32> @test_load_i64_bitcast_2xi32(ptr nocapture noundef readonly %a) local_unnamed_addr {
+; BOTH-LABEL: test_load_i64_bitcast_2xi32:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    ldapr x8, [x0]
+; BOTH-NEXT:    fmov d0, x8
+; BOTH-NEXT:    ret
+  %1 = load atomic i64, ptr %a acquire, align 8
+  %2 = bitcast i64 %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+; bitcast from a 64-bit vector, followed by a store-release of the i64
+define hidden void @test_bitcast_2xi32_store_i64(ptr nocapture noundef writeonly %a, <2 x i32> noundef %b) local_unnamed_addr {
+; BOTH-LABEL: test_bitcast_2xi32_store_i64:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    fmov x8, d0
+; BOTH-NEXT:    stlr x8, [x0]
+; BOTH-NEXT:    ret
+  %1 = bitcast <2 x i32> %b to i64
+  store atomic i64 %1, ptr %a release, align 8
+  ret void
+}
+
+; (non-atomic) load a 64-bit vector
+define hidden <2 x i32> @test_load_2xi32(ptr nocapture noundef readonly %a) local_unnamed_addr {
+; BOTH-LABEL: test_load_2xi32:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    ldr d0, [x0]
+; BOTH-NEXT:    ret
+  %1 = load <2 x i32>, ptr %a, align 8
+  ret <2 x i32> %1
+}
+
+; (non-atomic) store a 64-bit vector
+define hidden void @test_store_2xi32(ptr nocapture noundef writeonly %a, <2 x i32> noundef %b) local_unnamed_addr {
+; BOTH-LABEL: test_store_2xi32:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    str d0, [x0]
+; BOTH-NEXT:    ret
+  store <2 x i32> %b, ptr %a, align 8
+  ret void
+}
+
+; Baseline: a plain (non-atomic) load of a whole <1 x i64> vector
+define hidden <1 x i64> @test_load_1xi64(ptr nocapture noundef readonly %a) local_unnamed_addr {
+; BOTH-LABEL: test_load_1xi64:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    ldr d0, [x0]
+; BOTH-NEXT:    ret
+  %vec = load <1 x i64>, ptr %a, align 8
+  ret <1 x i64> %vec
+}
+
+; Baseline: a plain (non-atomic) store of a whole <1 x i64> vector
+define hidden void @test_store_1xi64(ptr nocapture noundef writeonly %a, <1 x i64> noundef %b) local_unnamed_addr {
+; BOTH-LABEL: test_store_1xi64:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    str d0, [x0]
+; BOTH-NEXT:    ret
+  store <1 x i64> %b, ptr %a, align 8
+  ret void
+}
+
+; Baseline: a plain (non-atomic) i64 load inserted into lane 0 of a <2 x i64>
+define hidden <2 x i64> @test_load_insert_2xi64(ptr nocapture noundef readonly %a, <2 x i64> noundef %b) local_unnamed_addr {
+; BOTH-LABEL: test_load_insert_2xi64:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    ld1 { v0.d }[0], [x0]
+; BOTH-NEXT:    ret
+  %scalar = load i64, ptr %a, align 8
+  %vec = insertelement <2 x i64> %b, i64 %scalar, i64 0
+  ret <2 x i64> %vec
+}
+
+; Baseline: lane 1 of a <2 x i64> extracted and written with a plain (non-atomic) store
+define hidden void @test_extract_store_2xi64(ptr nocapture noundef writeonly %a, <2 x i64> noundef %b) local_unnamed_addr {
+; BOTH-LABEL: test_extract_store_2xi64:
+; BOTH:       // %bb.0:
+; BOTH-NEXT:    st1 { v0.d }[1], [x0]
+; BOTH-NEXT:    ret
+  %lane = extractelement <2 x i64> %b, i64 1
+  store i64 %lane, ptr %a, align 8
+  ret void
+}