diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2086,3 +2086,9 @@ "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">; } } + +// v8.9a/v9.4a LRCPC3 intrinsics +let ArchGuard = "defined(__aarch64__)", TargetGuard = "rcpc3" in { + def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; + def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; +} diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6764,6 +6764,21 @@ { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, }, { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, }, { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, }, + // The mangling rules cause us to have one ID for each type for vldap1(q)_lane + // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an + // arbitrary one to be handled as the canonical variation. 
+ { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 }, + { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 }, + { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 }, + { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 }, + { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 }, + { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 }, + { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 }, + { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 }, + { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 }, + { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 }, + { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 }, + { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 }, }; #undef NEONMAP0 @@ -10581,6 +10596,10 @@ case NEON::BI__builtin_neon_vst1q_v: case NEON::BI__builtin_neon_vst1_lane_v: case NEON::BI__builtin_neon_vst1q_lane_v: + case NEON::BI__builtin_neon_vldap1_lane_s64: + case NEON::BI__builtin_neon_vldap1q_lane_s64: + case NEON::BI__builtin_neon_vstl1_lane_s64: + case NEON::BI__builtin_neon_vstl1q_lane_s64: // Get the alignment for the argument in addition to the value; // we'll use it later. 
PtrOp0 = EmitPointerWithAlignment(E->getArg(0)); @@ -12179,6 +12198,17 @@ PtrOp0.getAlignment()); return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane"); } + case NEON::BI__builtin_neon_vldap1_lane_s64: + case NEON::BI__builtin_neon_vldap1q_lane_s64: { + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ty = llvm::PointerType::getUnqual(VTy->getElementType()); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + llvm::LoadInst *LI = Builder.CreateAlignedLoad( + VTy->getElementType(), Ops[0], PtrOp0.getAlignment()); + LI->setAtomic(llvm::AtomicOrdering::Acquire); + Ops[0] = LI; + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane"); + } case NEON::BI__builtin_neon_vld1_dup_v: case NEON::BI__builtin_neon_vld1q_dup_v: { Value *V = PoisonValue::get(Ty); @@ -12197,6 +12227,16 @@ Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); return Builder.CreateAlignedStore(Ops[1], Builder.CreateBitCast(Ops[0], Ty), PtrOp0.getAlignment()); + case NEON::BI__builtin_neon_vstl1_lane_s64: + case NEON::BI__builtin_neon_vstl1q_lane_s64: { + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]); + Ty = llvm::PointerType::getUnqual(Ops[1]->getType()); + llvm::StoreInst *SI = Builder.CreateAlignedStore( + Ops[1], Builder.CreateBitCast(Ops[0], Ty), PtrOp0.getAlignment()); + SI->setAtomic(llvm::AtomicOrdering::Release); + return SI; + } case NEON::BI__builtin_neon_vld2_v: case NEON::BI__builtin_neon_vld2q_v: { llvm::Type *PTy = llvm::PointerType::getUnqual(VTy); diff --git a/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c b/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-neon-ldst-one-rcpc3.c @@ -0,0 +1,201 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +rcpc3 -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt 
-S -passes=mem2reg | FileCheck %s + +// REQUIRES: aarch64-registered-target + +#include <arm_neon.h> + + +// CHECK-LABEL: @test_vldap1q_lane_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]] +// +uint64x2_t test_vldap1q_lane_u64(uint64_t *a, uint64x2_t b) { + return vldap1q_lane_u64(a, b, 1); +} + +// CHECK-LABEL: @test_vldap1q_lane_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]] +// +int64x2_t test_vldap1q_lane_s64(int64_t *a, int64x2_t b) { + return vldap1q_lane_s64(a, b, 1); +} + +// CHECK-LABEL: @test_vldap1q_lane_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x double> [[VLDAP1_LANE]] +// +float64x2_t test_vldap1q_lane_f64(float64_t *a, float64x2_t b) { + return vldap1q_lane_f64(a, b, 1); +} + +// CHECK-LABEL: @test_vldap1q_lane_p64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// 
CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]] +// +poly64x2_t test_vldap1q_lane_p64(poly64_t *a, poly64x2_t b) { + return vldap1q_lane_p64(a, b, 1); +} + +// CHECK-LABEL: @test_vldap1_lane_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]] +// +uint64x1_t test_vldap1_lane_u64(uint64_t *a, uint64x1_t b) { + return vldap1_lane_u64(a, b, 0); +} + +// CHECK-LABEL: @test_vldap1_lane_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]] +// +int64x1_t test_vldap1_lane_s64(int64_t *a, int64x1_t b) { + return vldap1_lane_s64(a, b, 0); +} + +// CHECK-LABEL: @test_vldap1_lane_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> [[TMP1]], double [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x double> [[VLDAP1_LANE]] +// +float64x1_t test_vldap1_lane_f64(float64_t *a, float64x1_t b) { + return vldap1_lane_f64(a, b, 0); +} + +// CHECK-LABEL: @test_vldap1_lane_p64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]] +// +poly64x1_t test_vldap1_lane_p64(poly64_t *a, poly64x1_t b) { + return vldap1_lane_p64(a, b, 0); +} + +// CHECK-LABEL: @test_vstl1q_lane_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1q_lane_u64(uint64_t *a, uint64x2_t b) { + vstl1q_lane_u64(a, b, 1); +} + +// CHECK-LABEL: @test_vstl1q_lane_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1q_lane_s64(int64_t *a, int64x2_t b) { + vstl1q_lane_s64(a, b, 1); +} + +// CHECK-LABEL: @test_vstl1q_lane_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1q_lane_f64(float64_t *a, float64x2_t b) { + vstl1q_lane_f64(a, b, 1); +} + +// CHECK-LABEL: @test_vstl1q_lane_p64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x 
i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1q_lane_p64(poly64_t *a, poly64x2_t b) { + vstl1q_lane_p64(a, b, 1); +} + +// CHECK-LABEL: @test_vstl1_lane_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1_lane_u64(uint64_t *a, uint64x1_t b) { + vstl1_lane_u64(a, b, 0); +} + +// CHECK-LABEL: @test_vstl1_lane_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1_lane_s64(int64_t *a, int64x1_t b) { + vstl1_lane_s64(a, b, 0); +} + +// CHECK-LABEL: @test_vstl1_lane_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 +// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1_lane_f64(float64_t *a, float64x1_t b) { + vstl1_lane_f64(a, b, 0); +} + +// CHECK-LABEL: @test_vstl1_lane_p64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK-NEXT: store atomic i64 [[TMP2]], ptr 
[[A:%.*]] release, align 8 +// CHECK-NEXT: ret void +// +void test_vstl1_lane_p64(poly64_t *a, poly64x1_t b) { + vstl1_lane_p64(a, b, 0); +} diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2086,12 +2086,13 @@ std::string Name = Def->getName(); // Omit type checking for the pointer arguments of vld1_lane, vld1_dup, - // and vst1_lane intrinsics. Using a pointer to the vector element - // type with one of those operations causes codegen to select an aligned - // load/store instruction. If you want an unaligned operation, - // the pointer argument needs to have less alignment than element type, - // so just accept any pointer type. - if (Name == "vld1_lane" || Name == "vld1_dup" || Name == "vst1_lane") { + // vst1_lane, vldap1_lane, and vstl1_lane intrinsics. Using a pointer to + // the vector element type with one of those operations causes codegen to + // select an aligned load/store instruction. If you want an unaligned + // operation, the pointer argument needs to have less alignment than element + // type, so just accept any pointer type. + if (Name == "vld1_lane" || Name == "vld1_dup" || Name == "vst1_lane" || + Name == "vldap1_lane" || Name == "vstl1_lane") { PtrArgNum = -1; HasConstPtr = false; }