Index: clang/include/clang/Basic/arm_neon.td =================================================================== --- clang/include/clang/Basic/arm_neon.td +++ clang/include/clang/Basic/arm_neon.td @@ -1874,6 +1874,40 @@ def SCALAR_VDUP_LANE_BF : IInst<"vdup_lane", "1.I", "Sb">; def SCALAR_VDUP_LANEQ_BF : IInst<"vdup_laneq", "1QI", "Sb">; + + def VLD1_BF : WInst<"vld1", ".(c*!)", "bQb">; + def VLD2_BF : WInst<"vld2", "2(c*!)", "bQb">; + def VLD3_BF : WInst<"vld3", "3(c*!)", "bQb">; + def VLD4_BF : WInst<"vld4", "4(c*!)", "bQb">; + + def VST1_BF : WInst<"vst1", "v*(.!)", "bQb">; + def VST2_BF : WInst<"vst2", "v*(2!)", "bQb">; + def VST3_BF : WInst<"vst3", "v*(3!)", "bQb">; + def VST4_BF : WInst<"vst4", "v*(4!)", "bQb">; + + def VLD1_X2_BF : WInst<"vld1_x2", "2(c*!)", "bQb">; + def VLD1_X3_BF : WInst<"vld1_x3", "3(c*!)", "bQb">; + def VLD1_X4_BF : WInst<"vld1_x4", "4(c*!)", "bQb">; + + def VST1_X2_BF : WInst<"vst1_x2", "v*(2!)", "bQb">; + def VST1_X3_BF : WInst<"vst1_x3", "v*(3!)", "bQb">; + def VST1_X4_BF : WInst<"vst1_x4", "v*(4!)", "bQb">; + + def VLD1_LANE_BF : WInst<"vld1_lane", ".(c*!).I", "bQb">; + def VLD2_LANE_BF : WInst<"vld2_lane", "2(c*!)2I", "bQb">; + def VLD3_LANE_BF : WInst<"vld3_lane", "3(c*!)3I", "bQb">; + def VLD4_LANE_BF : WInst<"vld4_lane", "4(c*!)4I", "bQb">; + def VST1_LANE_BF : WInst<"vst1_lane", "v*(.!)I", "bQb">; + def VST2_LANE_BF : WInst<"vst2_lane", "v*(2!)I", "bQb">; + def VST3_LANE_BF : WInst<"vst3_lane", "v*(3!)I", "bQb">; + def VST4_LANE_BF : WInst<"vst4_lane", "v*(4!)I", "bQb">; + + def VLD1_DUP_BF : WInst<"vld1_dup", ".(c*!)", "bQb">; + def VLD2_DUP_BF : WInst<"vld2_dup", "2(c*!)", "bQb">; + def VLD3_DUP_BF : WInst<"vld3_dup", "3(c*!)", "bQb">; + def VLD4_DUP_BF : WInst<"vld4_dup", "4(c*!)", "bQb">; + + } let ArchGuard = "defined(__ARM_FEATURE_BF16) && !defined(__aarch64__)" in { Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ 
clang/lib/CodeGen/CGBuiltin.cpp @@ -10364,9 +10364,9 @@ } case NEON::BI__builtin_neon_vld1_v: case NEON::BI__builtin_neon_vld1q_v: { + auto Alignment = CGM.getNaturalPointeeTypeAlignment( + E->getArg(0)->IgnoreParenCasts()->getType()); Ops[0] = Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(VTy)); - auto Alignment = CharUnits::fromQuantity( - BuiltinID == NEON::BI__builtin_neon_vld1_v ? 8 : 16); return Builder.CreateAlignedLoad(VTy, Ops[0], Alignment); } case NEON::BI__builtin_neon_vst1_v: @@ -10379,8 +10379,8 @@ Ops[1] = Builder.CreateBitCast(Ops[1], Ty); Ty = llvm::PointerType::getUnqual(VTy->getElementType()); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - auto Alignment = CharUnits::fromQuantity( - BuiltinID == NEON::BI__builtin_neon_vld1_lane_v ? 8 : 16); + auto Alignment = CGM.getNaturalPointeeTypeAlignment( + E->getArg(0)->IgnoreParenCasts()->getType()); Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment); return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane"); @@ -10390,8 +10390,8 @@ Value *V = UndefValue::get(Ty); Ty = llvm::PointerType::getUnqual(VTy->getElementType()); Ops[0] = Builder.CreateBitCast(Ops[0], Ty); - auto Alignment = CharUnits::fromQuantity( - BuiltinID == NEON::BI__builtin_neon_vld1_dup_v ? 
8 : 16); + auto Alignment = CGM.getNaturalPointeeTypeAlignment( + E->getArg(0)->IgnoreParenCasts()->getType()); Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0], Alignment); llvm::Constant *CI = ConstantInt::get(Int32Ty, 0); Index: clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c =================================================================== --- /dev/null +++ clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c @@ -0,0 +1,415 @@ +// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ +// RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64 +// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ +// RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32 + +#include "arm_neon.h" + +bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) { + return vld1_bf16(ptr); +} +// CHECK-LABEL: test_vld1_bf16 +// CHECK64: %1 = load <4 x bfloat>, <4 x bfloat>* %0 +// CHECK64: ret <4 x bfloat> %1 +// CHECK32: = load <4 x bfloat>, <4 x bfloat>* %0, align 2 +// CHECK32: ret <4 x bfloat> %1 + +bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) { + return vld1q_bf16(ptr); +} +// CHECK-LABEL: test_vld1q_bf16 +// CHECK64: %1 = load <8 x bfloat>, <8 x bfloat>* %0 +// CHECK64: ret <8 x bfloat> %1 +// CHECK32: %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2 +// CHECK32: ret <8 x bfloat> %1 + +bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) { + return vld1_lane_bf16(ptr, src, 0); +} +// CHECK-LABEL: test_vld1_lane_bf16 +// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK64: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0 +// CHECK64: ret <4 x bfloat> %vld1_lane +// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK32: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0 +// CHECK32: ret <4 x bfloat> %vld1_lane + +bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, 
bfloat16x8_t src) { + return vld1q_lane_bf16(ptr, src, 7); +} +// CHECK-LABEL: test_vld1q_lane_bf16 +// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK64: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7 +// CHECK64: ret <8 x bfloat> %vld1_lane +// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK32: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7 +// CHECK32: ret <8 x bfloat> %vld1_lane + +bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) { + return vld1_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld1_dup_bf16 +// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK64: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0 +// CHECK64: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer +// CHECK64: ret <4 x bfloat> %lane +// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK32: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0 +// CHECK32: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer +// CHECK32: ret <4 x bfloat> %lane + +bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) { + return vld1_bf16_x2(ptr); +} +// CHECK-LABEL: test_vld1_bf16_x2 +// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr) + +bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) { + return vld1q_bf16_x2(ptr); +} +// CHECK-LABEL: test_vld1q_bf16_x2 +// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr) + +bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) { + return vld1_bf16_x3(ptr); +} +// CHECK-LABEL: test_vld1_bf16_x3 +// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x 
bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr) + +bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) { + return vld1q_bf16_x3(ptr); +} +// CHECK-LABEL: test_vld1q_bf16_x3 +// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr) + +bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) { + return vld1_bf16_x4(ptr); +} +// CHECK-LABEL: test_vld1_bf16_x4 +// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr) + +bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) { + return vld1q_bf16_x4(ptr); +} +// CHECK-LABEL: test_vld1q_bf16_x4 +// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr) + +bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) { + return vld1q_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld1q_dup_bf16 +// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK64: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0 +// CHECK64: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK64: ret <8 x bfloat> %lane +// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2 +// CHECK32: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0 +// CHECK32: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x 
i32> zeroinitializer +// CHECK32: ret <8 x bfloat> %lane + +bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) { + return vld2_bf16(ptr); +} +// CHECK-LABEL: test_vld2_bf16 +// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>* +// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0) +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2) + +bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) { + return vld2q_bf16(ptr); +} +// CHECK-LABEL: test_vld2q_bf16 +// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>* +// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0) +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2) + +bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) { + return vld2_lane_bf16(ptr, src, 1); +} +// CHECK-LABEL: test_vld2_lane_bf16 +// CHECK64: %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0) +// CHECK32: %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2) + +bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) { + return vld2q_lane_bf16(ptr, src, 7); +} +// CHECK-LABEL: test_vld2q_lane_bf16 +// CHECK64: %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0) +// CHECK32: %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, 
i32 7, i32 2) + +bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) { + return vld3_bf16(ptr); +} +// CHECK-LABEL: test_vld3_bf16 +// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0) +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2) + +bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) { + return vld3q_bf16(ptr); +} +// CHECK-LABEL: test_vld3q_bf16 +// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0) +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2) + +bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) { + return vld3_lane_bf16(ptr, src, 1); +} +// CHECK-LABEL: test_vld3_lane_bf16 +// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) +// CHECK32: %3 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) + +bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) { + return vld3q_lane_bf16(ptr, src, 7); + // return vld3q_lane_bf16(ptr, src, 8); +} +// CHECK-LABEL: test_vld3q_lane_bf16 +// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) +// CHECK32: %3 = 
bitcast bfloat* %ptr to i8* +// CHECK32: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2) + +bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) { + return vld4_bf16(ptr); +} +// CHECK-LABEL: test_vld4_bf16 +// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0) +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2) + +bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) { + return vld4q_bf16(ptr); +} +// CHECK-LABEL: test_vld4q_bf16 +// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0) +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2) + +bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) { + return vld4_lane_bf16(ptr, src, 1); +} +// CHECK-LABEL: test_vld4_lane_bf16 +// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) +// CHECK32: %4 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) + +bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) { + return vld4q_lane_bf16(ptr, src, 7); +} +// 
CHECK-LABEL: test_vld4q_lane_bf16 +// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) +// CHECK32: %4 = bitcast bfloat* %ptr to i8* +// CHECK32: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) + +bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) { + return vld2_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld2_dup_bf16 +// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2) + +bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) { + return vld2q_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld2q_dup_bf16 +// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2) + +bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) { + return vld3_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld3_dup_bf16 +// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2) + +bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) { + return vld3q_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld3q_dup_bf16 +// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr) +// 
CHECK32: %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2) + +bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) { + return vld4_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld4_dup_bf16 +// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2) + +bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) { + return vld4q_dup_bf16(ptr); +} +// CHECK-LABEL: test_vld4q_dup_bf16 +// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr) +// CHECK32: %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2) + +void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) { + vst1_bf16(ptr, val); +} +// CHECK-LABEL: test_vst1_bf16 +// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>* +// CHECK64: store <4 x bfloat> %val, <4 x bfloat>* %0, align 8 +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2) + +void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) { + vst1q_bf16(ptr, val); +} +// CHECK-LABEL: test_vst1q_bf16 +// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>* +// CHECK64: store <8 x bfloat> %val, <8 x bfloat>* %0, align 16 +// CHECK32: %0 = bitcast bfloat* %ptr to i8* +// CHECK32: tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2) + +void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) { + vst1_lane_bf16(ptr, val, 1); +} +// CHECK-LABEL: test_vst1_lane_bf16 +// CHECK64: %0 = extractelement <4 x bfloat> %val, i32 1 +// CHECK64: store bfloat %0, bfloat* %ptr, align 2 +// CHECK32: %0 = 
extractelement <4 x bfloat> %val, i32 1 +// CHECK32: store bfloat %0, bfloat* %ptr, align 2 + +void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) { + vst1q_lane_bf16(ptr, val, 7); +} +// CHECK-LABEL: test_vst1q_lane_bf16 +// CHECK64: %0 = extractelement <8 x bfloat> %val, i32 7 +// CHECK64: store bfloat %0, bfloat* %ptr, align 2 +// CHECK32: %0 = extractelement <8 x bfloat> %val, i32 7 +// CHECK32: store bfloat %0, bfloat* %ptr, align 2 + +void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) { + vst1_bf16_x2(ptr, val); +} +// CHECK-LABEL: test_vst1_bf16_x2 +// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr) +// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1) + +void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) { + vst1q_bf16_x2(ptr, val); +} +// CHECK-LABEL: test_vst1q_bf16_x2 +// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr) +// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1) + +void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) { + vst1_bf16_x3(ptr, val); +} +// CHECK-LABEL: test_vst1_bf16_x3 +// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr) +// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2) + +void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) { + vst1q_bf16_x3(ptr, val); +} +// CHECK-LABEL: test_vst1q_bf16_x3 +// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 
x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr) +// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2) + +void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) { + vst1_bf16_x4(ptr, val); +} +// CHECK-LABEL: test_vst1_bf16_x4 +// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr) +// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3) + +void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) { + vst1q_bf16_x4(ptr, val); +} +// CHECK-LABEL: test_vst1q_bf16_x4 +// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr) +// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3) + +void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { + vst2_bf16(ptr, val); +} +// CHECK-LABEL: test_vst2_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2) + +void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { + vst2q_bf16(ptr, val); +} +// CHECK-LABEL: test_vst2q_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2) + +void 
test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { + vst2_lane_bf16(ptr, val, 1); +} +// CHECK-LABEL: test_vst2_lane_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2) + +void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { + vst2q_lane_bf16(ptr, val, 7); +} +// CHECK-LABEL: test_vst2q_lane_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2) + +void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { + vst3_bf16(ptr, val); +} +// CHECK-LABEL: test_vst3_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2) + +void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { + vst3q_bf16(ptr, val); +} +// CHECK-LABEL: test_vst3q_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2) + +void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { + vst3_lane_bf16(ptr, val, 1); +} +// CHECK-LABEL: test_vst3_lane_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x 
bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) + +void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { + vst3q_lane_bf16(ptr, val, 7); +} +// CHECK-LABEL: test_vst3q_lane_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2) + +void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { + vst4_bf16(ptr, val); +} +// CHECK-LABEL: test_vst4_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2) + +void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { + vst4q_bf16(ptr, val); +} +// CHECK-LABEL: test_vst4q_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2) + +void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { + vst4_lane_bf16(ptr, val, 1); +} +// CHECK-LABEL: test_vst4_lane_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> 
%val.coerce.fca.3.extract, i64 1, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) + +void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { + vst4q_lane_bf16(ptr, val, 7); +} +// CHECK-LABEL: test_vst4q_lane_bf16 +// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0) +// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) Index: clang/test/Sema/aarch64-bf16-ldst-intrinsics.c =================================================================== --- /dev/null +++ clang/test/Sema/aarch64-bf16-ldst-intrinsics.c @@ -0,0 +1,102 @@ +// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ +// RUN: -O2 -fallow-half-arguments-and-returns -verify -fsyntax-only %s + +#include "arm_neon.h" + +int x; + +bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) { + (void)vld1_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld1_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld1_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) { + (void)vld1q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld1q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld1q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x4x2_t 
test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) { + (void)vld2_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld2_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld2_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) { + (void)vld2q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld2q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld2q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) { + (void)vld3_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld3_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld3_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) { + (void)vld3q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld3q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld3q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) { + (void)vld4_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld4_lane_bf16(ptr, src, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld4_lane_bf16(ptr, src, x); // 
expected-error-re {{argument {{.*}} must be a constant integer}} +} + +bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) { + (void)vld4q_lane_bf16(ptr, src, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + (void)vld4q_lane_bf16(ptr, src, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + return vld4q_lane_bf16(ptr, src, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) { + vst1_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst1_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst1_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) { + vst1q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst1q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst1q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) { + vst2_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst2_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst2_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) { + vst2q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst2q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst2q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant 
integer}} +} + +void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) { + vst3_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst3_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst3_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) { + vst3q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst3q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst3q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) { + vst4_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst4_lane_bf16(ptr, val, 4); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst4_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + +void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) { + vst4q_lane_bf16(ptr, val, -1); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst4q_lane_bf16(ptr, val, 8); // expected-error-re {{argument value {{.*}} is outside the valid range}} + vst4q_lane_bf16(ptr, val, x); // expected-error-re {{argument {{.*}} must be a constant integer}} +} Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -3380,10 +3380,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT 
== MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3407,10 +3407,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3434,10 +3434,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3461,10 +3461,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == 
MVT::v2f32) { @@ -3488,10 +3488,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3515,10 +3515,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3542,10 +3542,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3569,10 +3569,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); return; - } 
else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3596,10 +3596,10 @@ } else if (VT == MVT::v16i8) { SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3621,7 +3621,7 @@ SelectLoadLane(Node, 2, AArch64::LD2i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 2, AArch64::LD2i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3639,7 +3639,7 @@ SelectLoadLane(Node, 3, AArch64::LD3i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 3, AArch64::LD3i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3657,7 +3657,7 @@ SelectLoadLane(Node, 4, AArch64::LD4i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectLoadLane(Node, 4, AArch64::LD4i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3732,10 +3732,12 @@ } else if (VT == MVT::v16i8) { SelectStore(Node, 2, AArch64::ST1Twov16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } 
else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 2, AArch64::ST1Twov4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 2, AArch64::ST1Twov8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3760,10 +3762,12 @@ } else if (VT == MVT::v16i8) { SelectStore(Node, 3, AArch64::ST1Threev16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 3, AArch64::ST1Threev4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 3, AArch64::ST1Threev8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3788,10 +3792,12 @@ } else if (VT == MVT::v16i8) { SelectStore(Node, 4, AArch64::ST1Fourv16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 4, AArch64::ST1Fourv4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 4, AArch64::ST1Fourv8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3816,10 +3822,12 @@ } else if (VT == MVT::v16i8) { SelectStore(Node, 2, AArch64::ST2Twov16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 2, AArch64::ST2Twov4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 2, AArch64::ST2Twov8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3844,10 +3852,12 @@ } else if (VT == MVT::v16i8) { SelectStore(Node, 3, 
AArch64::ST3Threev16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 3, AArch64::ST3Threev4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 3, AArch64::ST3Threev8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3872,10 +3882,12 @@ } else if (VT == MVT::v16i8) { SelectStore(Node, 4, AArch64::ST4Fourv16b); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v4bf16) { SelectStore(Node, 4, AArch64::ST4Fourv4h); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16) { SelectStore(Node, 4, AArch64::ST4Fourv8h); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -3898,7 +3910,7 @@ SelectStoreLane(Node, 2, AArch64::ST2i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 2, AArch64::ST2i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3917,7 +3929,7 @@ SelectStoreLane(Node, 3, AArch64::ST3i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 3, AArch64::ST3i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -3936,7 +3948,7 @@ SelectStoreLane(Node, 4, AArch64::ST4i8); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectStoreLane(Node, 4, AArch64::ST4i16); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT 
== MVT::v4f32 || @@ -4020,10 +4032,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4048,10 +4060,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4076,10 +4088,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4104,10 +4116,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == 
MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4132,10 +4144,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4160,10 +4172,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4188,10 +4200,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { 
SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4216,10 +4228,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4244,10 +4256,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4272,10 +4284,10 @@ } else if (VT == MVT::v16i8) { SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4298,7 +4310,7 @@ SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == 
MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4317,7 +4329,7 @@ SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4336,7 +4348,7 @@ SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4355,7 +4367,7 @@ SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4377,10 +4389,10 @@ } else if (VT == MVT::v16i8) { SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4406,10 +4418,10 @@ } else if (VT == MVT::v16i8) { SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); return; - } else if 
(VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4435,10 +4447,10 @@ } else if (VT == MVT::v16i8) { SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4464,10 +4476,10 @@ } else if (VT == MVT::v16i8) { SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4493,10 +4505,10 @@ } else if (VT == MVT::v16i8) { SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); return; } else if
(VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4522,10 +4534,10 @@ } else if (VT == MVT::v16i8) { SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); return; - } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) { SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); return; - } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) { SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); return; } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { @@ -4549,7 +4561,7 @@ SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4569,7 +4581,7 @@ SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || @@ -4589,7 +4601,7 @@ SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); return; } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) { + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); return; } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2233,6 +2233,7 @@ defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; + defm : 
VecROLoadPat; } defm : VecROLoadPat; @@ -2247,6 +2248,7 @@ defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; + defm : VecROLoadPat; defm : VecROLoadPat; } } // AddedComplexity = 10 @@ -2382,6 +2384,8 @@ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; @@ -2405,6 +2409,8 @@ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; @@ -2903,6 +2909,7 @@ defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; + defm : VecROStorePat; } defm : VecROStorePat; @@ -2918,6 +2925,7 @@ defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; + defm : VecROStorePat; } } // AddedComplexity = 10 @@ -3010,6 +3018,9 @@ def : Pat<(store (v4f16 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -3040,6 +3051,9 @@ def : Pat<(store (v8f16 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } // truncstore i64 @@ 
-3147,6 +3161,9 @@ def : Pat<(store (v4f16 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4bf16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; } // Match all store 128 bits width whose type is compatible with FPR128 @@ -3179,6 +3196,9 @@ def : Pat<(store (v8f16 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8bf16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } } // AddedComplexity = 10 @@ -6316,6 +6336,10 @@ (LD1Rv4h GPR64sp:$Rn)>; def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; class Ld1Lane128Pat @@ -6330,6 +6354,7 @@ def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; +def : Ld1Lane128Pat; class Ld1Lane64Pat @@ -6345,6 +6370,7 @@ def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; +def : Ld1Lane64Pat; defm LD1 : SIMDLdSt1SingleAliases<"ld1">; @@ -6373,6 +6399,7 @@ def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; +def : St1Lane128Pat; let AddedComplexity = 19 in class St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; +def : St1Lane64Pat; multiclass St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; +defm : St1LanePost64Pat; multiclass St1LanePost128Pat; defm : St1LanePost128Pat; defm : St1LanePost128Pat; +defm : St1LanePost128Pat; let mayStore = 1, hasSideEffects = 0 in { defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; Index: llvm/test/CodeGen/AArch64/aarch64-bf16-ldst-intrinsics.ll =================================================================== --- /dev/null +++ 
llvm/test/CodeGen/AArch64/aarch64-bf16-ldst-intrinsics.ll @@ -0,0 +1,826 @@ +; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - | FileCheck %s + +%struct.bfloat16x4x2_t = type { [2 x <4 x bfloat>] } +%struct.bfloat16x8x2_t = type { [2 x <8 x bfloat>] } +%struct.bfloat16x4x3_t = type { [3 x <4 x bfloat>] } +%struct.bfloat16x8x3_t = type { [3 x <8 x bfloat>] } +%struct.bfloat16x4x4_t = type { [4 x <4 x bfloat>] } +%struct.bfloat16x8x4_t = type { [4 x <8 x bfloat>] } + +; CHECK-LABEL: test_vld1_bf16 +; CHECK: ldr d0, [x0] +define <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #0 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2 + ret <4 x bfloat> %1 +} + +; CHECK-LABEL: test_vld1q_bf16 +; CHECK: ldr q0, [x0] +define <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #1 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2 + ret <8 x bfloat> %1 +} + +; CHECK-LABEL: test_vld1_lane_bf16 +; CHECK: ld1 { v0.h }[0], [x0] +define <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) local_unnamed_addr #0 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0 + ret <4 x bfloat> %vld1_lane +} + +; CHECK-LABEL: test_vld1q_lane_bf16 +; CHECK: ld1 { v0.h }[7], [x0] +define <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) local_unnamed_addr #1 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7 + ret <8 x bfloat> %vld1_lane +} + +; CHECK-LABEL: test_vld1_dup_bf16 +; CHECK: ld1r { v0.4h }, [x0] +define <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #0 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0 + %lane 
= shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer + ret <4 x bfloat> %lane +} + +; CHECK-LABEL: test_vld1_bf16_x2 +; CHECK: ld1 { v0.4h, v1.4h }, [x0] +define %struct.bfloat16x4x2_t @test_vld1_bf16_x2(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_bf16_x2 +; CHECK: ld1 { v0.8h, v1.8h }, [x0] +define %struct.bfloat16x8x2_t @test_vld1q_bf16_x2(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1_bf16_x3 +; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0] +define %struct.bfloat16x4x3_t @test_vld1_bf16_x3(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } 
@llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_bf16_x3 +; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0] +define %struct.bfloat16x8x3_t @test_vld1q_bf16_x3(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat*) #3 + 
+; CHECK-LABEL: test_vld1_bf16_x4 +; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define %struct.bfloat16x4x4_t @test_vld1_bf16_x4(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2 + %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld1xN.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld1xN.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_bf16_x4 +; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define %struct.bfloat16x8x4_t @test_vld1q_bf16_x4(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr) + %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0 + %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1 + %vld1xN.fca.2.extract = extractvalue { <8 
x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2 + %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld1xN.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld1xN.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld1xN.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld1xN.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld1q_dup_bf16 +; CHECK: ld1r { v0.8h }, [x0] +define <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) local_unnamed_addr #1 { +entry: + %0 = load bfloat, bfloat* %ptr, align 2 + %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0 + %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer + ret <8 x bfloat> %lane +} + +; CHECK-LABEL: test_vld2_bf16 +; CHECK: ld2 { v0.4h, v1.4h }, [x0] +define %struct.bfloat16x4x2_t @test_vld2_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0) + %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly 
nounwind readonly +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>*) #3 + +; CHECK-LABEL: test_vld2q_bf16 +; CHECK: ld2 { v0.8h, v1.8h }, [x0] +define %struct.bfloat16x8x2_t @test_vld2q_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0) + %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>*) #3 + +; CHECK-LABEL: test_vld2_lane_bf16 +; CHECK: ld2 { v0.h, v1.h }[1], [x0] +define %struct.bfloat16x4x2_t @test_vld2_lane_bf16(bfloat* %ptr, [2 x <4 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %src.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0) + %vld2_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2_lane.fca.1.extract, 0, 1 + ret 
%struct.bfloat16x4x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld2q_lane_bf16 +; CHECK: ld2 { v0.h, v1.h }[7], [x0] +define %struct.bfloat16x8x2_t @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <8 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %src.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0) + %vld2_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 0 + %vld2_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2_lane, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2_lane.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld3_bf16 +; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0] +define %struct.bfloat16x4x3_t @test_vld3_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0) + %vld3.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x 
bfloat>, <4 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>*) #3 + +; CHECK-LABEL: test_vld3q_bf16 +; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0] +define %struct.bfloat16x8x3_t @test_vld3q_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0) + %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>*) #3 + +; CHECK-LABEL: test_vld3_lane_bf16 +; CHECK: ld3 { v0.h, v1.h, v2.h }[1], [x0] +define %struct.bfloat16x4x3_t @test_vld3_lane_bf16(bfloat* %ptr, [3 x <4 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] 
%src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %src.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0) + %vld3_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld3q_lane_bf16 +; CHECK: ld3 { v0.h, v1.h, v2.h }[7], [x0] +define %struct.bfloat16x8x3_t @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <8 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %src.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> 
%src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0) + %vld3_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 0 + %vld3_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 1 + %vld3_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3_lane, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3_lane.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld4_bf16 +; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define %struct.bfloat16x4x4_t @test_vld4_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0) + %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = 
insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>*) #3 + +; CHECK-LABEL: test_vld4q_bf16 +; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define %struct.bfloat16x8x4_t @test_vld4q_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0) + %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>*) #3 + +; CHECK-LABEL: test_vld4_lane_bf16 +; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[1], [x0] +define %struct.bfloat16x4x4_t 
@test_vld4_lane_bf16(bfloat* %ptr, [4 x <4 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 2 + %src.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %src.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0) + %vld4_lane.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 0 + %vld4_lane.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 1 + %vld4_lane.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 2 + %vld4_lane.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4_lane.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8*) #3 + +; CHECK-LABEL: test_vld4q_lane_bf16 +; CHECK: ld4 { v0.h, 
v1.h, v2.h, v3.h }[7], [x0] +define %struct.bfloat16x8x4_t @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <8 x bfloat>] %src.coerce) local_unnamed_addr #2 { +entry: + %src.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 0 + %src.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 1 + %src.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 2 + %src.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %src.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0) + %vld4_lane.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 0 + %vld4_lane.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 1 + %vld4_lane.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 2 + %vld4_lane.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4_lane, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4_lane.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4_lane.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4_lane.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4_lane.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8*) #3 + 
+; CHECK-LABEL: test_vld2_dup_bf16 +; CHECK: ld2r { v0.4h, v1.4h }, [x0] +define %struct.bfloat16x4x2_t @test_vld2_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr) + %vld2.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x2_t undef, <4 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x4x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld2q_dup_bf16 +; CHECK: ld2r { v0.8h, v1.8h }, [x0] +define %struct.bfloat16x8x2_t @test_vld2q_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr) + %vld2.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 0 + %vld2.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2, 1 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x2_t undef, <8 x bfloat> %vld2.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vld2.fca.1.extract, 0, 1 + ret %struct.bfloat16x8x2_t %.fca.0.1.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld3_dup_bf16 +; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0] +define %struct.bfloat16x4x3_t @test_vld3_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr) + %vld3.fca.0.extract = extractvalue 
{ <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x3_t undef, <4 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.0.insert, <4 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x3_t %.fca.0.1.insert, <4 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x4x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld3q_dup_bf16 +; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0] +define %struct.bfloat16x8x3_t @test_vld3q_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr) + %vld3.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 0 + %vld3.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 1 + %vld3.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3, 2 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x3_t undef, <8 x bfloat> %vld3.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.0.insert, <8 x bfloat> %vld3.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x3_t %.fca.0.1.insert, <8 x bfloat> %vld3.fca.2.extract, 0, 2 + ret %struct.bfloat16x8x3_t %.fca.0.2.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld4_dup_bf16 +; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define %struct.bfloat16x4x4_t 
@test_vld4_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr) + %vld4.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x4x4_t undef, <4 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.0.insert, <4 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.1.insert, <4 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x4x4_t %.fca.0.2.insert, <4 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x4x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vld4q_dup_bf16 +; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define %struct.bfloat16x8x4_t @test_vld4q_dup_bf16(bfloat* %ptr) local_unnamed_addr #2 { +entry: + %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr) + %vld4.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 0 + %vld4.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 1 + %vld4.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4, 2 + %vld4.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x 
bfloat> } %vld4, 3 + %.fca.0.0.insert = insertvalue %struct.bfloat16x8x4_t undef, <8 x bfloat> %vld4.fca.0.extract, 0, 0 + %.fca.0.1.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.0.insert, <8 x bfloat> %vld4.fca.1.extract, 0, 1 + %.fca.0.2.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.1.insert, <8 x bfloat> %vld4.fca.2.extract, 0, 2 + %.fca.0.3.insert = insertvalue %struct.bfloat16x8x4_t %.fca.0.2.insert, <8 x bfloat> %vld4.fca.3.extract, 0, 3 + ret %struct.bfloat16x8x4_t %.fca.0.3.insert +} + +; Function Attrs: argmemonly nounwind readonly +declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat*) #3 + +; CHECK-LABEL: test_vst1_bf16 +; CHECK: str d0, [x0] +define void @test_vst1_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr #4 { +entry: + %0 = bitcast bfloat* %ptr to <4 x bfloat>* + store <4 x bfloat> %val, <4 x bfloat>* %0, align 8 + ret void +} + +; CHECK-LABEL: test_vst1q_bf16 +; CHECK: str q0, [x0] +define void @test_vst1q_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr #5 { +entry: + %0 = bitcast bfloat* %ptr to <8 x bfloat>* + store <8 x bfloat> %val, <8 x bfloat>* %0, align 16 + ret void +} + +; CHECK-LABEL: test_vst1_lane_bf16 +; CHECK: st1 { v0.h }[1], [x0] +define void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) local_unnamed_addr #4 { +entry: + %0 = extractelement <4 x bfloat> %val, i32 1 + store bfloat %0, bfloat* %ptr, align 2 + ret void +} + +; CHECK-LABEL: test_vst1q_lane_bf16 +; CHECK: st1 { v0.h }[7], [x0] +define void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) local_unnamed_addr #5 { +entry: + %0 = extractelement <8 x bfloat> %val, i32 7 + store bfloat %0, bfloat* %ptr, align 2 + ret void +} + +; CHECK-LABEL: test_vst1_bf16_x2 +; CHECK: st1 { v0.4h, v1.4h }, [x0] +define void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + 
%val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1 + tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1q_bf16_x2 +; CHECK: st1 { v0.8h, v1.8h }, [x0] +define void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1 + tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1_bf16_x3 +; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0] +define void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2 + tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1q_bf16_x3 +; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0] +define 
void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2 + tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst1_bf16_x4 +; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3 + tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst1q_bf16_x4 +; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x 
<8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3 + tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, bfloat* nocapture) #7 + +; CHECK-LABEL: test_vst2_bf16 +; CHECK: st2 { v0.4h, v1.4h }, [x0] +define void @test_vst2_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i8* nocapture) #7 + +; CHECK-LABEL: test_vst2q_bf16 +; CHECK: st2 { v0.8h, v1.8h }, [x0] +define void @test_vst2q_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i8* nocapture) #7 + +; CHECK-LABEL: test_vst2_lane_bf16 +; CHECK: st2 { v0.h, v1.h 
}[1], [x0] +define void @test_vst2_lane_bf16(bfloat* nocapture %ptr, [2 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <4 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst2q_lane_bf16 +; CHECK: st2 { v0.h, v1.h }[7], [x0] +define void @test_vst2q_lane_bf16(bfloat* nocapture %ptr, [2 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [2 x <8 x bfloat>] %val.coerce, 1 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3_bf16 +; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0] +define void @test_vst3_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> 
%val.coerce.fca.2.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3q_bf16 +; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0] +define void @test_vst3q_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3_lane_bf16 +; CHECK: st3 { v0.h, v1.h, v2.h }[1], [x0] +define void @test_vst3_lane_bf16(bfloat* nocapture %ptr, [3 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <4 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst3q_lane_bf16 +; CHECK: st3 { v0.h, v1.h, v2.h 
}[7], [x0] +define void @test_vst3q_lane_bf16(bfloat* nocapture %ptr, [3 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [3 x <8 x bfloat>] %val.coerce, 2 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4_bf16 +; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0] +define void @test_vst4_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4q_bf16 +; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0] +define void @test_vst4q_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = 
extractvalue [4 x <8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4_lane_bf16 +; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[1], [x0] +define void @test_vst4_lane_bf16(bfloat* nocapture %ptr, [4 x <4 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <4 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i64, i8* nocapture) #7 + +; Function Attrs: nounwind +; CHECK-LABEL: test_vst4q_lane_bf16 +; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[7], [x0] +define void @test_vst4q_lane_bf16(bfloat* nocapture %ptr, [4 x <8 x bfloat>] %val.coerce) local_unnamed_addr #6 { +entry: + %val.coerce.fca.0.extract = extractvalue [4 x 
<8 x bfloat>] %val.coerce, 0 + %val.coerce.fca.1.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 1 + %val.coerce.fca.2.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 2 + %val.coerce.fca.3.extract = extractvalue [4 x <8 x bfloat>] %val.coerce, 3 + %0 = bitcast bfloat* %ptr to i8* + tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i64, i8* nocapture) #7 + +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { argmemonly nounwind readonly } +attributes #4 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nofree norecurse nounwind writeonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="128" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #6 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+bf16,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #7 = { argmemonly nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 11.0.0"}