Index: clang/include/clang/Basic/arm_neon.td =================================================================== --- clang/include/clang/Basic/arm_neon.td +++ clang/include/clang/Basic/arm_neon.td @@ -236,6 +236,22 @@ : Op<(call "vusdot", $p0, (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>; +def OP_BFDOT_LN + : Op<(call "vbfdot", $p0, $p1, + (bitcast $p1, (call_mangled "splat_lane", (bitcast "float32x2_t", $p2), $p3)))>; + +def OP_BFDOT_LNQ + : Op<(call "vbfdot", $p0, $p1, + (bitcast $p1, (call_mangled "splat_lane", (bitcast "float32x4_t", $p2), $p3)))>; + +def OP_BFMLALB_LN + : Op<(call "vbfmlalb", $p0, $p1, + (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; + +def OP_BFMLALT_LN + : Op<(call "vbfmlalt", $p0, $p1, + (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; + //===----------------------------------------------------------------------===// // Auxiliary Instructions //===----------------------------------------------------------------------===// @@ -1833,6 +1849,25 @@ } } +let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in { + def VDOT_BF : SInst<"vbfdot", "..BB", "fQf">; + def VDOT_LANE_BF : SOpInst<"vbfdot_lane", "..B(Bq)I", "fQf", OP_BFDOT_LN>; + def VDOT_LANEQ_BF : SOpInst<"vbfdot_laneq", "..B(BQ)I", "fQf", OP_BFDOT_LNQ> { + let isLaneQ = 1; + } + + def VFMMLA_BF : SInst<"vbfmmla", "..BB", "Qf">; + + def VFMLALB_BF : SInst<"vbfmlalb", "..BB", "Qf">; + def VFMLALT_BF : SInst<"vbfmlalt", "..BB", "Qf">; + + def VFMLALB_LANE_BF : SOpInst<"vbfmlalb_lane", "..B(Bq)I", "Qf", OP_BFMLALB_LN>; + def VFMLALB_LANEQ_BF : SOpInst<"vbfmlalb_laneq", "..B(BQ)I", "Qf", OP_BFMLALB_LN>; + + def VFMLALT_LANE_BF : SOpInst<"vbfmlalt_lane", "..B(Bq)I", "Qf", OP_BFMLALT_LN>; + def VFMLALT_LANEQ_BF : SOpInst<"vbfmlalt_laneq", "..B(BQ)I", "Qf", OP_BFMLALT_LN>; +} + // v8.3-A Vector complex addition intrinsics let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { def 
VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">; Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -4956,6 +4956,11 @@ NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0), NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0), NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0), + NEONMAP1(vbfdot_v, aarch64_neon_bfdot, 0), + NEONMAP1(vbfdotq_v, aarch64_neon_bfdot, 0), + NEONMAP1(vbfmlalbq_v, aarch64_neon_bfmlalb, 0), + NEONMAP1(vbfmlaltq_v, aarch64_neon_bfmlalt, 0), + NEONMAP1(vbfmmlaq_v, aarch64_neon_bfmmla, 0), NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType), NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType), NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType), @@ -6127,6 +6132,31 @@ llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot"); } + case NEON::BI__builtin_neon_vbfdot_v: + case NEON::BI__builtin_neon_vbfdotq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot"); + } + case NEON::BI__builtin_neon_vbfmmlaq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla"); + } + case NEON::BI__builtin_neon_vbfmlalbq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb"); + } + case NEON::BI__builtin_neon_vbfmlaltq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt"); + } } assert(Int && 
"Expected valid intrinsic number"); Index: clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c =================================================================== --- /dev/null +++ clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c @@ -0,0 +1,134 @@ +// RUN: %clang_cc1 -triple aarch64-arm-none-eabi \ +// RUN: -O2 -target-feature +neon -target-feature +bf16 \ +// RUN: -emit-llvm -o - %s | FileCheck %s + +#include <arm_neon.h> + +// CHECK-LABEL: test_vbfdot_f32 +// CHECK: %0 = bitcast <4 x bfloat> %a to <8 x i8> +// CHECK: %1 = bitcast <4 x bfloat> %b to <8 x i8> +// CHECK: %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) +// CHECK: ret <2 x float> %vbfdot1.i +float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { + return vbfdot_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfdotq_f32 +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK: %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfdot1.i +float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ + return vbfdotq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfdot_lane_f32 +// CHECK: %0 = bitcast <4 x bfloat> %b to <2 x float> +// CHECK: %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer +// CHECK: %1 = bitcast <4 x bfloat> %a to <8 x i8> +// CHECK: %2 = bitcast <2 x float> %lane to <8 x i8> +// CHECK: %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) +// CHECK: ret <2 x float> %vbfdot1.i +float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ + return vbfdot_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfdotq_laneq_f32 +// CHECK: %0 = bitcast <8 x bfloat> %b to <4 x float> +// CHECK: %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x 
i32> +// CHECK: %1 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %2 = bitcast <4 x float> %lane to <16 x i8> +// CHECK: %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) +// CHECK: ret <4 x float> %vbfdot1.i +float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfdotq_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: test_vbfdot_laneq_f32 +// CHECK: %0 = bitcast <8 x bfloat> %b to <4 x float> +// CHECK: %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> +// CHECK: %1 = bitcast <4 x bfloat> %a to <8 x i8> +// CHECK: %2 = bitcast <2 x float> %lane to <8 x i8> +// CHECK: %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) +// CHECK: ret <2 x float> %vbfdot1.i +float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { + return vbfdot_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: test_vbfdotq_lane_f32 +// CHECK: %0 = bitcast <4 x bfloat> %b to <2 x float> +// CHECK: %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer +// CHECK: %1 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %2 = bitcast <4 x float> %lane to <16 x i8> +// CHECK: %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) +// CHECK: ret <4 x float> %vbfdot1.i +float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { + return vbfdotq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfmmlaq_f32 +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK: %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmmla1.i +float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmmlaq_f32(r, a, b); +} + +// 
CHECK-LABEL: test_vbfmlalbq_f32 +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK: %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmlalb1.i +float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlalbq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfmlaltq_f32 +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %b to <16 x i8> +// CHECK: %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmlalt1.i +float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlaltq_f32(r, a, b); +} + +// CHECK-LABEL: test_vbfmlalbq_lane_f32 +// CHECK: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> +// CHECK: %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmlalb1.i +float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { + return vbfmlalbq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfmlalbq_laneq_f32 +// CHECK: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> +// CHECK: %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmlalb1.i +float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlalbq_laneq_f32(r, a, b, 3); +} + +// CHECK-LABEL: 
test_vbfmlaltq_lane_f32 +// CHECK: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> +// CHECK: %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmlalt1.i +float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { + return vbfmlaltq_lane_f32(r, a, b, 0); +} + +// CHECK-LABEL: test_vbfmlaltq_laneq_f32 +// CHECK: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> +// CHECK: %0 = bitcast <8 x bfloat> %a to <16 x i8> +// CHECK: %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> +// CHECK: %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) +// CHECK: ret <4 x float> %vbfmlalt1.i +float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { + return vbfmlaltq_laneq_f32(r, a, b, 3); +} Index: clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c =================================================================== --- clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c +++ clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c @@ -165,35 +165,18 @@ return vld3_lane_bf16(ptr, src, 1); } // CHECK-LABEL: test_vld3_lane_bf16 - -// %src.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 0 -// %src.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 1 -// %src.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 2 -// %0 = bitcast bfloat* %ptr to i8* -// %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) -// %vld3_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 
x bfloat>, <4 x bfloat> } %vld3_lane, 0 -// %vld3_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 1 -// %vld3_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 2 -// %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3_lane.fca.0.extract, 0, 0 -// %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3_lane.fca.1.extract, 0, 1 -// %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3_lane.fca.2.extract, 0, 2 +// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) +// CHECK32: %3 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) { return vld3q_lane_bf16(ptr, src, 7); // return vld3q_lane_bf16(ptr, src, 8); } // CHECK-LABEL: test_vld3q_lane_bf16 -// %src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0 -// %src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1 -// %src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2 -// %0 = bitcast bfloat* %ptr to i8* -// %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) -// %vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0 -// %vld3_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } 
%vld3_lane, 1 -// %vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 2 -// %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0 -// %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1 -// %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2 +// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) +// CHECK32: %3 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2) bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) { return vld4_bf16(ptr); @@ -215,39 +198,17 @@ return vld4_lane_bf16(ptr, src, 1); } // CHECK-LABEL: test_vld4_lane_bf16 -// %src.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 0 -// %src.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 1 -// %src.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 2 -// %src.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 3 -// %0 = bitcast bfloat* %ptr to i8* -// %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) -// %vld4_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 0 -// %vld4_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x 
bfloat> } %vld4_lane, 1 -// %vld4_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 2 -// %vld4_lane.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 3 -// %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4_lane.fca.0.extract, 0, 0 -// %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4_lane.fca.1.extract, 0, 1 -// %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4_lane.fca.2.extract, 0, 2 -// %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4_lane.fca.3.extract, 0, 3 +// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) +// CHECK32: %4 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) { return vld4q_lane_bf16(ptr, src, 7); } // CHECK-LABEL: test_vld4q_lane_bf16 -// %src.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 0 -// %src.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 1 -// %src.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 2 -// %src.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 3 -// %0 = bitcast bfloat* %ptr to i8* -// %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> 
%src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) -// %vld4_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 0 -// %vld4_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 1 -// %vld4_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 2 -// %vld4_lane.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 3 -// %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <8 x bfloat> %vld4_lane.fca.0.extract, 0, 0 -// %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <8 x bfloat> %vld4_lane.fca.1.extract, 0, 1 -// %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <8 x bfloat> %vld4_lane.fca.2.extract, 0, 2 -// %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <8 x bfloat> %vld4_lane.fca.3.extract, 0, 3 +// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) +// CHECK32: %4 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) { return vld2_dup_bf16(ptr); Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -178,6 +178,12 @@ : Intrinsic<[llvm_anyvector_ty], 
[LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + + class AdvSIMD_FML_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; + } // Arithmetic ops @@ -459,6 +465,11 @@ def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; + def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; + def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic; + def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic; + // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7815,16 +7815,19 @@ class BaseSIMDThreeSameVectorBFDot - : BaseSIMDThreeSameVectorTied { + : BaseSIMDThreeSameVectorTied { let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}"); } multiclass SIMDThreeSameVectorBFDot { - def v4f16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, + def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, v2f32, v8i8>; - def v8f16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, + def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, v4f32, v16i8>; } @@ -7837,7 +7840,13 @@ : BaseSIMDIndexedTied { + [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_bfdot + (AccumType RegType:$Rd), + (InputType RegType:$Rn), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexH:$idx)))))))]> { bits<2> idx; let Inst{21} = idx{0}; // L @@ -7846,23 +7855,30 @@ multiclass SIMDThreeSameVectorBF16DotI { - def v4f16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", 
".4h", + def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", ".2h", V64, v2f32, v8i8>; - def v8f16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", + def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", ".2h", V128, v4f32, v16i8>; } -class SIMDBF16MLAL +class SIMDBF16MLAL : BaseSIMDThreeSameVectorTied { // TODO: Add intrinsics + [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } -class SIMDBF16MLALIndex +class SIMDBF16MLALIndex : I<(outs V128:$dst), (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm, "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", - []>, // TODO: Add intrinsics + [(set (v4f32 V128:$dst), + (v4f32 (OpNode (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 (bitconvert (v8bf16 + (AArch64duplane16 (v8bf16 V128_lo:$Rm), + VectorIndexH:$idx)))))))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; @@ -7884,7 +7900,10 @@ class SIMDThreeSameVectorBF16MatrixMul : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101, V128, asm, ".4s", - []> { + [(set (v4f32 V128:$dst), + (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", ", $Rm", ".8h", "}"); } Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -768,10 +768,10 @@ defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">; defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">; def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">; -def BFMLALB : SIMDBF16MLAL<0, "bfmlalb">; -def BFMLALT : SIMDBF16MLAL<1, "bfmlalt">; -def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb">; -def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt">; +def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALT : 
SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>; +def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>; +def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; def BFCVT : BF16ToSinglePrecision<"bfcvt">; Index: llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll @@ -0,0 +1,149 @@ +; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - | FileCheck %s + +; CHECK-LABEL: test_vbfdot_f32 +; CHECK: bfdot v0.2s, v1.4h, v2.4h +define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ret <2 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdotq_f32 +; CHECK: bfdot v0.4s, v1.8h, v2.8h +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdot_lane_f32 +; CHECK: bfdot v0.2s, v1.4h, v2.2h[0] +define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +entry: + %0 = bitcast <4 x bfloat> %b to <2 x float> + %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer + %1 = bitcast <4 x bfloat> %a to <8 x i8> + %2 = bitcast <2 x float> %shuffle to <8 x i8> + %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) + ret <2 x float> %vbfdot1.i +} + +; CHECK-LABEL: 
test_vbfdotq_laneq_f32 +; CHECK: bfdot v0.4s, v1.8h, v2.2h[3] +define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %0 = bitcast <8 x bfloat> %b to <4 x float> + %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %1 = bitcast <8 x bfloat> %a to <16 x i8> + %2 = bitcast <4 x float> %shuffle to <16 x i8> + %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) + ret <4 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdot_laneq_f32 +; CHECK: bfdot v0.2s, v1.4h, v2.2h[3] +define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { +entry: + %0 = bitcast <8 x bfloat> %b to <4 x float> + %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %1 = bitcast <4 x bfloat> %a to <8 x i8> + %2 = bitcast <2 x float> %shuffle to <8 x i8> + %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) + ret <2 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfdotq_lane_f32 +; CHECK: bfdot v0.4s, v1.8h, v2.2h[0] +define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { +entry: + %0 = bitcast <4 x bfloat> %b to <2 x float> + %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer + %1 = bitcast <8 x bfloat> %a to <16 x i8> + %2 = bitcast <4 x float> %shuffle to <16 x i8> + %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) + ret <4 x float> %vbfdot1.i +} + +; CHECK-LABEL: test_vbfmmlaq_f32 +; CHECK: bfmmla v0.4s, v1.8h, v2.8h +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> 
%1) + ret <4 x float> %vbfmmla1.i +} + +; CHECK-LABEL: test_vbfmlalbq_f32 +; CHECK: bfmlalb v0.4s, v1.8h, v2.8h +define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfmlalb1.i +} + +; CHECK-LABEL: test_vbfmlaltq_f32 +; CHECK: bfmlalt v0.4s, v1.8h, v2.8h +define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfmlalt1.i +} + +; CHECK-LABEL: test_vbfmlalbq_lane_f32 +; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0] +define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { +entry: + %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfmlalb1.i +} + +; CHECK-LABEL: test_vbfmlalbq_laneq_f32 +; CHECK: bfmlalb v0.4s, v1.8h, v2.h[3] +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfmlalb1.i +} + +; CHECK-LABEL: test_vbfmlaltq_lane_f32 +; CHECK: bfmlalt v0.4s, v1.8h, 
v2.h[0] +define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { +entry: + %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfmlalt1.i +} + +; CHECK-LABEL: test_vbfmlaltq_laneq_f32 +; CHECK: bfmlalt v0.4s, v1.8h, v2.h[3] +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2 +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 Index: llvm/test/CodeGen/AArch64/aarch64-bf16-ldst-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-bf16-ldst-intrinsics.ll @@ -0,0 +1,826 @@ +; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - | FileCheck %s + +%struct.bfloat16x4x2_t = type { [2 x <4 x bfloat>] } +%struct.bfloat16x8x2_t = type { [2 x <8 x bfloat>] } +%struct.bfloat16x4x3_t = type { [3 x <4 x bfloat>] } +%struct.bfloat16x8x3_t = type { [3 x <8 x bfloat>] } 
+%struct.bfloat16x4x4_t = type { [4 x <4 x bfloat>] } +%struct.bfloat16x8x4_t = type { [4 x <8 x bfloat>] } + +; CHECK-LABEL: test_vld1_bf16 +; CHECK: ldr d0, [x0] +define <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #0 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2 + ret <4 x bfloat> %1 +} + +; CHECK-LABEL: test_vld1q_bf16 +; CHECK: ldr q0, [x0] +define <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #1 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2 + ret <8 x bfloat> %1 +} + +; CHECK-LABEL: test_vld1_lane_bf16 +; CHECK: ld1 { v0.h }[0], [x0] +define <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) local_unnamed_addr #0 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0 + ret <4 x bfloat> %vld1_lane +} + +; CHECK-LABEL: test_vld1q_lane_bf16 +; CHECK: ld1 { v0.h }[7], [x0] +define <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) local_unnamed_addr #1 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7 + ret <8 x bfloat> %vld1_lane +} + +; CHECK-LABEL: test_vld1_dup_bf16 +; CHECK: ld1r { v0.4h }, [x0] +define <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #0 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0 + %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer + ret <4 x bfloat> %lane +} + +; CHECK-LABEL: test_vld1_bf16_x2 +; CHECK: ld1 { v0.4h, v1.4h }, [x0] +define %struct.bfloat16x4x2_t @test_vld1_bf16_x2(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } 
@llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_bf16_x2 +; CHECK: ld1 { v0.8h, v1.8h }, [x0] +define %struct.bfloat16x8x2_t @test_vld1q_bf16_x2(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1_bf16_x3 +; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0] +define %struct.bfloat16x4x3_t @test_vld1_bf16_x3(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } 
%vld1xN, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_bf16_x3 +; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0] +define %struct.bfloat16x8x3_t @test_vld1q_bf16_x3(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1_bf16_x4 +; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define %struct.bfloat16x4x4_t @test_vld1_bf16_x4(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr) + 
%vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2 + %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld1xN.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_bf16_x4 +; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define %struct.bfloat16x8x4_t @test_vld1q_bf16_x4(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2 + %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue 
%struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld1xN.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_dup_bf16 +; CHECK: ld1r { v0.8h }, [x0] +define <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #1 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0 + %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer + ret <8 x bfloat> %lane +} + +; CHECK-LABEL: test_vld2_bf16 +; CHECK: ld2 { v0.4h, v1.4h }, [x0] +define %struct.bfloat16x4x2_t @test_vld2_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0) + %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>*) #3 + +; CHECK-LABEL: test_vld2q_bf16 +; CHECK: ld2 { v0.8h, v1.8h }, [x0] +define %struct.bfloat16x8x2_t @test_vld2q_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to 
<8 x bfloat>* + %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0) + %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>*) #3 + +; CHECK-LABEL: test_vld2_lane_bf16 +; CHECK: ld2 { v0.h, v1.h }[1], [x0] +define %struct.bfloat16x4x2_t @test_vld2_lane_bf16(bfloat* %ptr, [2 x <4 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0) + %vld2_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld2q_lane_bf16 +; CHECK: ld2 { v0.h, v1.h }[7], [x0] +define %struct.bfloat16x8x2_t 
@test_vld2q_lane_bf16(bfloat* %ptr, [2 x <8 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0) + %vld2_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld3_bf16 +; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0] +define %struct.bfloat16x4x3_t @test_vld3_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0) + %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 
x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>*) #3 + +; CHECK-LABEL: test_vld3q_bf16 +; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0] +define %struct.bfloat16x8x3_t @test_vld3q_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0) + %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>*) #3 + +; CHECK-LABEL: test_vld3_lane_bf16 +; CHECK: ld3 { v0.h, v1.h, v2.h }[1], [x0] +define %struct.bfloat16x4x3_t @test_vld3_lane_bf16(bfloat* %ptr, [3 x <4 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> 
%src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) + %vld3_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld3q_lane_bf16 +; CHECK: ld3 { v0.h, v1.h, v2.h }[7], [x0] +define %struct.bfloat16x8x3_t @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <8 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) + %vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, 
<8 x bfloat>, <8 x bfloat> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld4_bf16 +; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define %struct.bfloat16x4x4_t @test_vld4_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0) + %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x 
bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>*) #3 + +; CHECK-LABEL: test_vld4q_bf16 +; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define %struct.bfloat16x8x4_t @test_vld4q_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0) + %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>*) #3 + +; CHECK-LABEL: test_vld4_lane_bf16 +; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0] +define %struct.bfloat16x4x4_t @test_vld4_lane_bf16(bfloat* %ptr, [4 x <4 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 2 + 
%src.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) + %vld4_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 0 + %vld4_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 1 + %vld4_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 2 + %vld4_lane.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4_lane.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld4q_lane_bf16 +; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[7], [x0] +define %struct.bfloat16x8x4_t @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <8 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 1 + 
%src.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 2 + %src.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) + %vld4_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 0 + %vld4_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 1 + %vld4_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 2 + %vld4_lane.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4_lane.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld2_dup_bf16 +; CHECK: ld2r { v0.4h, v1.4h }, [x0] +define %struct.bfloat16x4x2_t @test_vld2_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr) + %vld2.fca.0.extract = extractvalue { <4 x 
bfloat>, <4 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld2q_dup_bf16 +; CHECK: ld2r { v0.8h, v1.8h }, [x0] +define %struct.bfloat16x8x2_t @test_vld2q_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr) + %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld3_dup_bf16 +; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0] +define %struct.bfloat16x4x3_t @test_vld3_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr) + %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> 
%vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld3q_dup_bf16 +; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0] +define %struct.bfloat16x8x3_t @test_vld3q_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr) + %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld4_dup_bf16 +; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define %struct.bfloat16x4x4_t @test_vld4_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr) + %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0 + 
%vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld4q_dup_bf16 +; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define %struct.bfloat16x8x4_t @test_vld4q_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr) + %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, 
<8 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vst1_bf16 +; CHECK: str d0, [x0] +define void @test_vst1_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr #4 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + store <4 x bfloat> %val, <4 x bfloat>* %0, align 8 + ret void +} + +; CHECK-LABEL: test_vst1q_bf16 +; CHECK: str q0, [x0] +define void @test_vst1q_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr #5 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + store <8 x bfloat> %val, <8 x bfloat>* %0, align 16 + ret void +} + +; CHECK-LABEL: test_vst1_lane_bf16 +; CHECK: st1 { v0.h }[1], [x0] +define void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr #4 { +entry: + %0 = extractelement <4 x bfloat> %val, i32 1 + store bfloat %0, bfloat* %ptr, align 2 + ret void +} + +; CHECK-LABEL: test_vst1q_lane_bf16 +; CHECK: st1 { v0.h }[7], [x0] +define void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr #5 { +entry: + %0 = extractelement <8 x bfloat> %val, i32 7 + store bfloat %0, bfloat* %ptr, align 2 + ret void +} + +; CHECK-LABEL: test_vst1_bf16_x2 +; CHECK: st1 { v0.4h, v1.4h }, [x0] +define void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1 + tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr) + ret void +} + +; 
Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1q_bf16_x2 +; CHECK: st1 { v0.8h, v1.8h }, [x0] +define void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1 + tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1_bf16_x3 +; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0] +define void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2 + tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1q_bf16_x3 +; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0] +define void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <8 x 
bfloat>] %val.coerce, 2 + tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst1_bf16_x4 +; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3 + tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1q_bf16_x4 +; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3 + tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, 
<8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst2_bf16 +; CHECK: st2 { v0.4h, v1.4h }, [x0] +define void @test_vst2_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i8* nocapture) #7 + +; CHECK-LABEL: test_vst2q_bf16 +; CHECK: st2 { v0.8h, v1.8h }, [x0] +define void @test_vst2q_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i8* nocapture) #7 + +; CHECK-LABEL: test_vst2_lane_bf16 +; CHECK: st2 { v0.h, v1.h }[1], [x0] +define void @test_vst2_lane_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to 
i8* + tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst2q_lane_bf16 +; CHECK: st2 { v0.h, v1.h }[7], [x0] +define void @test_vst2q_lane_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3_bf16 +; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0] +define void @test_vst3_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3q_bf16 +; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0] +define void 
@test_vst3q_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3_lane_bf16 +; CHECK: st3 { v0.h, v1.h, v2.h }[1], [x0] +define void @test_vst3_lane_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3q_lane_bf16 +; CHECK: st3 { v0.h, v1.h, v2.h }[7], [x0] +define void @test_vst3q_lane_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = 
extractvalue [3 x <8 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4_bf16 +; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define void @test_vst4_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4q_bf16 +; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define void @test_vst4q_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail 
call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4_lane_bf16 +; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0] +define void @test_vst4_lane_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4q_lane_bf16 +; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0] +define void @test_vst4q_lane_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void 
@llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) #7 + +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { argmemonly nounwind readonly } +attributes #4 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" 
"less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 11.0.0"}