Add BFloat16 (bf16) NEON get/set/dup/combine intrinsics to Clang.

- arm_neon.td: defines vcreate, vdup_n, vdup_lane/vdup_laneq, vcombine,
  vget_high/vget_low, vget_lane/vset_lane and the scalar vdup_lane/vdup_laneq
  forms for element type "b", guarded by
  __ARM_FEATURE_BF16_VECTOR_ARITHMETIC.
- CGBuiltin.cpp: adds the new bf16 lane builtins to the existing case lists
  that emit extractelement / insertelement.
- New test aarch64-bf16-getset-intrinsics.c checks the emitted IR for an
  aarch64 triple (CHECK64) and an armv8.6a triple (CHECK32).

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1841,3 +1841,28 @@
   def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
 }
+
+// V8.2-A BFloat intrinsics
+let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)" in {
+  def VCREATE_BF : NoTestOpInst<"vcreate", ".(IU>)", "b", OP_CAST> {
+    let BigEndianSafe = 1;
+  }
+
+  def VDUP_N_BF : WOpInst<"vdup_n", ".1", "bQb", OP_DUP>;
+
+  def VDUP_LANE_BF : WOpInst<"vdup_lane", ".qI", "bQb", OP_DUP_LN>;
+  def VDUP_LANEQ_BF: WOpInst<"vdup_laneq", ".QI", "bQb", OP_DUP_LN> {
+    let isLaneQ = 1;
+  }
+
+  def VCOMBINE_BF : NoTestOpInst<"vcombine", "Q..", "b", OP_CONC>;
+
+  def VGET_HIGH_BF : NoTestOpInst<"vget_high", ".Q", "b", OP_HI>;
+  def VGET_LOW_BF : NoTestOpInst<"vget_low", ".Q", "b", OP_LO>;
+
+  def VGET_LANE_BF : IInst<"vget_lane", "1.I", "bQb">;
+  def VSET_LANE_BF : IInst<"vset_lane", ".1.I", "bQb">;
+
+  def SCALAR_VDUP_LANE_BF : IInst<"vdup_lane", "1.I", "Sb">;
+  def SCALAR_VDUP_LANEQ_BF : IInst<"vdup_laneq", "1QI", "Sb">;
+}
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6305,21 +6305,27 @@
   case NEON::BI__builtin_neon_vget_lane_i32:
   case NEON::BI__builtin_neon_vget_lane_i64:
   case NEON::BI__builtin_neon_vget_lane_f32:
+  case NEON::BI__builtin_neon_vget_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_lane_bf16:
   case NEON::BI__builtin_neon_vgetq_lane_i8:
   case NEON::BI__builtin_neon_vgetq_lane_i16:
   case NEON::BI__builtin_neon_vgetq_lane_i32:
   case NEON::BI__builtin_neon_vgetq_lane_i64:
   case NEON::BI__builtin_neon_vgetq_lane_f32:
+  case NEON::BI__builtin_neon_vgetq_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_laneq_bf16:
   case NEON::BI__builtin_neon_vset_lane_i8:
   case NEON::BI__builtin_neon_vset_lane_i16:
   case NEON::BI__builtin_neon_vset_lane_i32:
   case NEON::BI__builtin_neon_vset_lane_i64:
   case NEON::BI__builtin_neon_vset_lane_f32:
+  case NEON::BI__builtin_neon_vset_lane_bf16:
   case NEON::BI__builtin_neon_vsetq_lane_i8:
   case NEON::BI__builtin_neon_vsetq_lane_i16:
   case NEON::BI__builtin_neon_vsetq_lane_i32:
   case NEON::BI__builtin_neon_vsetq_lane_i64:
   case NEON::BI__builtin_neon_vsetq_lane_f32:
+  case NEON::BI__builtin_neon_vsetq_lane_bf16:
   case NEON::BI__builtin_neon_vsha1h_u32:
   case NEON::BI__builtin_neon_vsha1cq_u32:
   case NEON::BI__builtin_neon_vsha1pq_u32:
@@ -6768,6 +6774,10 @@
   case NEON::BI__builtin_neon_vgetq_lane_i32:
   case NEON::BI__builtin_neon_vgetq_lane_i64:
   case NEON::BI__builtin_neon_vgetq_lane_f32:
+  case NEON::BI__builtin_neon_vget_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_lane_bf16:
+  case NEON::BI__builtin_neon_vgetq_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_laneq_bf16:
     return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
 
   case NEON::BI__builtin_neon_vrndns_f32: {
@@ -6786,6 +6796,8 @@
   case NEON::BI__builtin_neon_vsetq_lane_i32:
   case NEON::BI__builtin_neon_vsetq_lane_i64:
   case NEON::BI__builtin_neon_vsetq_lane_f32:
+  case NEON::BI__builtin_neon_vset_lane_bf16:
+  case NEON::BI__builtin_neon_vsetq_lane_bf16:
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
 
   case NEON::BI__builtin_neon_vsha1h_u32:
@@ -9066,6 +9078,8 @@
   case NEON::BI__builtin_neon_vsetq_lane_i32:
   case NEON::BI__builtin_neon_vsetq_lane_i64:
   case NEON::BI__builtin_neon_vsetq_lane_f32:
+  case NEON::BI__builtin_neon_vset_lane_bf16:
+  case NEON::BI__builtin_neon_vsetq_lane_bf16:
     Ops.push_back(EmitScalarExpr(E->getArg(2)));
     return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
   case NEON::BI__builtin_neon_vset_lane_f64:
@@ -9153,6 +9167,14 @@
         llvm::VectorType::get(DoubleTy, 2));
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
+  case NEON::BI__builtin_neon_vget_lane_bf16:
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(BFloatTy, 4));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vget_lane");
+  case NEON::BI__builtin_neon_vgetq_lane_bf16:
+    Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(BFloatTy, 8));
+    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
+                                        "vgetq_lane");
   case NEON::BI__builtin_neon_vaddh_f16:
     Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
@@ -9335,10 +9357,12 @@
                                         : Intrinsic::aarch64_neon_sqsub;
     return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
   }
+  case NEON::BI__builtin_neon_vduph_lane_bf16:
   case NEON::BI__builtin_neon_vduph_lane_f16: {
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vget_lane");
   }
+  case NEON::BI__builtin_neon_vduph_laneq_bf16:
   case NEON::BI__builtin_neon_vduph_laneq_f16: {
     return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
                                         "vgetq_lane");
diff --git a/clang/test/CodeGen/aarch64-bf16-getset-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-getset-intrinsics.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-bf16-getset-intrinsics.c
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
+// RUN:  -O2 -fallow-half-arguments-and-returns -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
+// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
+// RUN:  -O2 -fallow-half-arguments-and-returns -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32
+
+#include <arm_neon.h>
+
+bfloat16x4_t test_vcreate_bf16(uint64_t a) {
+  return vcreate_bf16(a);
+}
+// CHECK-LABEL: test_vcreate_bf16
+// CHECK64: %0 = bitcast i64 %a to <4 x bfloat>
+// CHECK32: %0 = bitcast i64 %a to <4 x bfloat>
+
+bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) {
+  return vdup_n_bf16(v);
+}
+// CHECK-LABEL: test_vdup_n_bf16
+// CHECK64: %vecinit.i = insertelement <4 x bfloat> undef, bfloat %v, i32 0
+// CHECK32: %vecinit.i = insertelement <4 x bfloat> undef, bfloat %v, i32 0
+// CHECK: %vecinit{{.*}} = shufflevector <4 x bfloat> %vecinit.i, <4 x bfloat> undef, <4 x i32> zeroinitializer
+
+bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
+  return vdupq_n_bf16(v);
+}
+// CHECK-LABEL: test_vdupq_n_bf16
+// CHECK64: %vecinit.i = insertelement <8 x bfloat> undef, bfloat %v, i32 0
+// CHECK32: %vecinit.i = insertelement <8 x bfloat> undef, bfloat %v, i32 0
+// CHECK: %vecinit{{.*}} = shufflevector <8 x bfloat> %vecinit.i, <8 x bfloat> undef, <8 x i32> zeroinitializer
+
+bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
+  return vdup_lane_bf16(v, 1);
+}
+// CHECK-LABEL: test_vdup_lane_bf16
+// CHECK64: %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK32: %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+
+bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
+  return vdupq_lane_bf16(v, 1);
+}
+// CHECK-LABEL: test_vdupq_lane_bf16
+// CHECK64: %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+// CHECK32: %lane = shufflevector <4 x bfloat> %v, <4 x bfloat> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+
+bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
+  return vdup_laneq_bf16(v, 7);
+}
+// CHECK-LABEL: test_vdup_laneq_bf16
+// CHECK64: %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK32: %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+
+bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) {
+  return vdupq_laneq_bf16(v, 7);
+}
+// CHECK-LABEL: test_vdupq_laneq_bf16
+// CHECK64: %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK32: %lane = shufflevector <8 x bfloat> %v, <8 x bfloat> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+
+bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) {
+  return vcombine_bf16(low, high);
+}
+// CHECK-LABEL: test_vcombine_bf16
+// CHECK64: %shuffle.i = shufflevector <4 x bfloat> %low, <4 x bfloat> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK32: %shuffle.i = shufflevector <4 x bfloat> %low, <4 x bfloat> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
+  return vget_high_bf16(a);
+}
+// CHECK-LABEL: test_vget_high_bf16
+// CHECK64: %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK32: %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+
+bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
+  return vget_low_bf16(a);
+}
+// CHECK-LABEL: test_vget_low_bf16
+// CHECK64: %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK32: %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
+  return vget_lane_bf16(v, 1);
+}
+// CHECK-LABEL: test_vget_lane_bf16
+// CHECK64: %vget_lane = extractelement <4 x bfloat> %v, i32 1
+// CHECK32: %vget_lane = extractelement <4 x bfloat> %v, i32 1
+
+bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
+  return vgetq_lane_bf16(v, 7);
+}
+// CHECK-LABEL: test_vgetq_lane_bf16
+// CHECK64: %vgetq_lane = extractelement <8 x bfloat> %v, i32 7
+// CHECK32: %vget_lane = extractelement <8 x bfloat> %v, i32 7
+
+bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
+  return vset_lane_bf16(a, v, 1);
+}
+// CHECK-LABEL: test_vset_lane_bf16
+// CHECK64: %vset_lane = insertelement <4 x bfloat> %v, bfloat %a, i32 1
+// CHECK32: %vset_lane = insertelement <4 x bfloat> %v, bfloat %a, i32 1
+
+bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
+  return vsetq_lane_bf16(a, v, 7);
+}
+// CHECK-LABEL: test_vsetq_lane_bf16
+// CHECK64: %vset_lane = insertelement <8 x bfloat> %v, bfloat %a, i32 7
+// CHECK32: %vset_lane = insertelement <8 x bfloat> %v, bfloat %a, i32 7
+
+bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
+  return vduph_lane_bf16(v, 1);
+}
+// CHECK-LABEL: test_vduph_lane_bf16
+// CHECK64: %vget_lane = extractelement <4 x bfloat> %v, i32 1
+// CHECK32: %vget_lane = extractelement <4 x bfloat> %v, i32 1
+
+bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) {
+  return vduph_laneq_bf16(v, 7);
+}
+// CHECK-LABEL: test_vduph_laneq_bf16
+// CHECK64: %vgetq_lane = extractelement <8 x bfloat> %v, i32 7
+// CHECK32: %vget_lane = extractelement <8 x bfloat> %v, i32 7