diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -252,6 +252,34 @@ : Op<(call "vbfmlalt", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; +def OP_VCVT_F32_BF16 + : Op<(bitcast "R", + (call "vshll_n", (bitcast "int16x4_t", $p0), + (literal "int32_t", "16")))>; +def OP_VCVT_F32_BF16_LO + : Op<(call "vcvt_f32_bf16", (call "vget_low", $p0))>; +def OP_VCVT_F32_BF16_HI + : Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>; + +def OP_VCVT_BF16_F32_LO_A64 + : Op<(call "__a64_vcvtq_low_bf16", $p0)>; +def OP_VCVT_BF16_F32_A64 + : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>; + +def OP_VCVT_BF16_F32_A32 + : Op<(call "__a32_vcvt_bf16", $p0)>; + +def OP_VCVT_BF16_F32_LO_A32 + : Op<(call "vcombine", (cast "bfloat16x4_t", (literal "uint64_t", "0ULL")), + (call "__a32_vcvt_bf16", $p0))>; +def OP_VCVT_BF16_F32_HI_A32 + : Op<(call "vcombine", (call "__a32_vcvt_bf16", $p1), + (call "vget_low", $p0))>; + +def OP_CVT_F32_BF16 + : Op<(bitcast "R", (op "<<", (bitcast "int32_t", $p0), + (literal "int32_t", "16")))>; + //===----------------------------------------------------------------------===// // Auxiliary Instructions //===----------------------------------------------------------------------===// @@ -1949,6 +1977,31 @@ def VLD3_DUP_BF : WInst<"vld3_dup", "3(c*!)", "bQb">; def VLD4_DUP_BF : WInst<"vld4_dup", "4(c*!)", "bQb">; + def VCVT_F32_BF16 : SOpInst<"vcvt_f32_bf16", "(F>)(Bq!)", "Qb", OP_VCVT_F32_BF16>; + def VCVT_LOW_F32_BF16 : SOpInst<"vcvt_low_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_LO>; + def VCVT_HIGH_F32_BF16 : SOpInst<"vcvt_high_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_HI>; + + def SCALAR_CVT_BF16_F32 : SInst<"vcvth_bf16", "(1B)1", "f">; + def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>; +} + +let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && 
!defined(__aarch64__)" in { + def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">; + def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>; + def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>; + def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>; +} + +let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in { + def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; + def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; + def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; + def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>; + + def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>; + def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>; + def COPY_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..IQI", "b", OP_COPY_LN>; + def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>; } let ArchGuard = "defined(__ARM_FEATURE_BF16) && !defined(__aarch64__)" in { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4735,6 +4735,7 @@ TypeModifier } static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { + NEONMAP1(__a32_vcvt_bf16_v, arm_neon_vcvtfp2bf, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -4813,6 +4814,7 @@ NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0), NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0), @@ -5029,6 +5031,7 @@ }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { + 
NEONMAP1(__a64_vcvtq_low_bf16_v, aarch64_neon_bfcvtn, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -5088,6 +5091,7 @@ NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP0(vcvtq_f16_v), NEONMAP0(vcvtq_f32_v), + NEONMAP1(vcvtq_high_bf16_v, aarch64_neon_bfcvtn2, 0), NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), @@ -5243,6 +5247,7 @@ NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0), NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType), @@ -6241,6 +6246,11 @@ llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt"); } + case NEON::BI__builtin_neon___a32_vcvt_bf16_v: { + llvm::Type *Tys[1] = { Ty }; + Function *F = CGM.getIntrinsic(Int, Tys); + return EmitNeonCall(F, Ops, "vcvtfp2bf"); + } } @@ -6449,6 +6459,7 @@ case NEON::BI__builtin_neon_vsha1cq_u32: case NEON::BI__builtin_neon_vsha1pq_u32: case NEON::BI__builtin_neon_vsha1mq_u32: + case NEON::BI__builtin_neon_vcvth_bf16_f32: case clang::ARM::BI_MoveToCoprocessor: case clang::ARM::BI_MoveToCoprocessor2: return false; @@ -6932,6 +6943,11 @@ return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops, "vsha1h"); + case NEON::BI__builtin_neon_vcvth_bf16_f32: { + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops, + "vcvtbfp2bf"); + } + // The ARM _MoveToCoprocessor builtins put the input register value as // the first 
argument, but the LLVM intrinsic expects it as the third one. case ARM::BI_MoveToCoprocessor: diff --git a/clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c @@ -0,0 +1,145 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck --check-prefix=CHECK-LE %s +// RUN: %clang_cc1 -triple aarch64_be-arm-none-eabi -target-feature +neon -target-feature +bf16 \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck --check-prefix=CHECK-BE %s + +#include <arm_neon.h> + +// CHECK-LE-LABEL: @test_vcopy_lane_bf16_v1( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = shufflevector <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]], <4 x i32> <i32 0, i32 7, i32 2, i32 3> +// CHECK-LE-NEXT: ret <4 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopy_lane_bf16_v1( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x bfloat> [[A:%.*]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = shufflevector <4 x bfloat> [[SHUFFLE]], <4 x bfloat> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 3> +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x bfloat> [[VSET_LANE]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <4 x bfloat> [[SHUFFLE5]] +// +bfloat16x4_t test_vcopy_lane_bf16_v1(bfloat16x4_t a, bfloat16x4_t b) { + return vcopy_lane_bf16(a, 1, b, 3); +} + +// CHECK-LE-LABEL: @test_vcopy_lane_bf16_v2( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = shufflevector <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 3> +// CHECK-LE-NEXT: ret <4 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopy_lane_bf16_v2( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <4 x bfloat> [[A:%.*]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = shufflevector <4 x bfloat> [[SHUFFLE]], <4 x bfloat> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 7, i32 3> +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x bfloat> [[VSET_LANE]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <4 x bfloat> [[SHUFFLE5]] +// +bfloat16x4_t test_vcopy_lane_bf16_v2(bfloat16x4_t a, bfloat16x4_t b) { + return vcopy_lane_bf16(a, 2, b, 0); +} + +// CHECK-LE-LABEL: @test_vcopyq_lane_bf16_v1( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[TMP0:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[TMP0]], <8 x i32> <i32 10, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-LE-NEXT: ret <8 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopyq_lane_bf16_v1( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[TMP0:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[SHUFFLE]], <8 x bfloat> [[TMP0]], <8 x i32> <i32 9, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x bfloat> [[VSET_LANE]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <8 x bfloat> [[SHUFFLE5]] +// +bfloat16x8_t test_vcopyq_lane_bf16_v1(bfloat16x8_t a, bfloat16x4_t b) { + return vcopyq_lane_bf16(a, 0, b, 2); +} + +// CHECK-LE-LABEL: @test_vcopyq_lane_bf16_v2( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[TMP0:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7> +// CHECK-LE-NEXT: ret <8 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopyq_lane_bf16_v2( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[TMP0:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> +// 
CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[SHUFFLE]], <8 x bfloat> [[TMP0]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 11, i32 7> +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x bfloat> [[VSET_LANE]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <8 x bfloat> [[SHUFFLE5]] +// +bfloat16x8_t test_vcopyq_lane_bf16_v2(bfloat16x8_t a, bfloat16x4_t b) { + return vcopyq_lane_bf16(a, 6, b, 0); +} + +// CHECK-LE-LABEL: @test_vcopy_laneq_bf16_v1( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 7 +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[A:%.*]], bfloat [[VGETQ_LANE]], i32 0 +// CHECK-LE-NEXT: ret <4 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopy_laneq_bf16_v1( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x bfloat> [[A:%.*]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 0 +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[SHUFFLE]], bfloat [[VGETQ_LANE]], i32 0 +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x bfloat> [[VSET_LANE]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <4 x bfloat> [[SHUFFLE5]] +// +bfloat16x4_t test_vcopy_laneq_bf16_v1(bfloat16x4_t a, bfloat16x8_t b) { + return vcopy_laneq_bf16(a, 0, b, 7); +} + +// CHECK-LE-LABEL: @test_vcopy_laneq_bf16_v2( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 4 +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[A:%.*]], bfloat [[VGETQ_LANE]], i32 3 +// CHECK-LE-NEXT: ret <4 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopy_laneq_bf16_v2( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x bfloat> [[A:%.*]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VGETQ_LANE:%.*]] = 
extractelement <8 x bfloat> [[B:%.*]], i32 3 +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[SHUFFLE]], bfloat [[VGETQ_LANE]], i32 3 +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x bfloat> [[VSET_LANE]], <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <4 x bfloat> [[SHUFFLE5]] +// +bfloat16x4_t test_vcopy_laneq_bf16_v2(bfloat16x4_t a, bfloat16x8_t b) { + return vcopy_laneq_bf16(a, 3, b, 4); +} + +// CHECK-LE-LABEL: @test_vcopyq_laneq_bf16_v1( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7> +// CHECK-LE-NEXT: ret <8 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopyq_laneq_bf16_v1( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[SHUFFLE]], <8 x bfloat> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 4, i32 5, i32 6, i32 7> +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x bfloat> [[VSET_LANE]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <8 x bfloat> [[SHUFFLE5]] +// +bfloat16x8_t test_vcopyq_laneq_bf16_v1(bfloat16x8_t a, bfloat16x8_t b) { + return vcopyq_laneq_bf16(a, 3, b, 7); + +} + +// CHECK-LE-LABEL: @test_vcopyq_laneq_bf16_v2( +// CHECK-LE-NEXT: entry: +// CHECK-LE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 10, i32 7> +// CHECK-LE-NEXT: ret <8 x bfloat> [[VSET_LANE]] +// +// CHECK-BE-LABEL: @test_vcopyq_laneq_bf16_v2( +// CHECK-BE-NEXT: entry: +// CHECK-BE-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: [[VSET_LANE:%.*]] = shufflevector <8 x bfloat> [[SHUFFLE]], <8 x bfloat> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 13, i32 7> +// CHECK-BE-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x bfloat> [[VSET_LANE]], <8 x bfloat> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> +// CHECK-BE-NEXT: ret <8 x bfloat> [[SHUFFLE5]] +// +bfloat16x8_t test_vcopyq_laneq_bf16_v2(bfloat16x8_t a, 
bfloat16x8_t b) { + return vcopyq_laneq_bf16(a, 6, b, 2); +} + diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c @@ -0,0 +1,88 @@ +// RUN: %clang_cc1 \ +// RUN: -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg -instcombine \ +// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A64 %s +// RUN: %clang_cc1 \ +// RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +bf16 -mfloat-abi hard \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg -instcombine \ +// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-HARDFP %s +// RUN: %clang_cc1 \ +// RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +bf16 -mfloat-abi softfp \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg -instcombine \ +// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-SOFTFP %s + +#include <arm_neon.h> + +// CHECK-LABEL: test_vcvt_f32_bf16 +// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32> +// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16> +float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { + return vcvt_f32_bf16(a); +} + +// CHECK-LABEL: test_vcvtq_low_f32_bf16 +// CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32> +// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16> +float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { + return vcvtq_low_f32_bf16(a); +} + +// CHECK-LABEL: test_vcvtq_high_f32_bf16 +// CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32> +// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16> +float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { + return vcvtq_high_f32_bf16(a); +} + +// CHECK-LABEL: 
test_vcvt_bf16_f32 +// CHECK-A64: %[[CVT:.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> %a) +// CHECK-A64: shufflevector <8 x bfloat> %[[CVT]], <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A32-HARDFP: call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> %a) +// CHECK-A32-SOFTFP: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> %a) +bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { + return vcvt_bf16_f32(a); +} + +// CHECK-LABEL: test_vcvtq_low_bf16_f32 +// CHECK-A64: call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> %a) +// CHECK-A32-HARDFP: %[[CVT:.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16 +// CHECK-A32-HARDFP: shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> %[[CVT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-A32-SOFTFP: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16 +// CHECK-A32-SOFTFP: shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { + return vcvtq_low_bf16_f32(a); +} + +// CHECK-LABEL: test_vcvtq_high_bf16_f32 +// CHECK-A64: call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> %inactive, <4 x float> %a) +// CHECK-A32-HARDFP: %[[CVT:.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> %a) +// CHECK-A32-HARDFP: %[[INACT:.*]] = shufflevector <8 x bfloat> %inactive, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A32-HARDFP: shufflevector <4 x bfloat> %[[CVT]], <4 x bfloat> %[[INACT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-A32-SOFTFP: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> %a) +// CHECK-A32-SOFTFP: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A32-SOFTFP: shufflevector <4 x bfloat> %{{.*}}, <4 x bfloat> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { + return vcvtq_high_bf16_f32(inactive, a); +} + +// CHECK-LABEL: test_vcvth_bf16_f32 +// CHECK-A64: call bfloat @llvm.aarch64.neon.bfcvt(float %a) +// CHECK-A32-HARDFP: call bfloat 
@llvm.arm.neon.vcvtbfp2bf(float %a) +// CHECK-A32-SOFTFP: call bfloat @llvm.arm.neon.vcvtbfp2bf(float %a) +bfloat16_t test_vcvth_bf16_f32(float32_t a) { + return vcvth_bf16_f32(a); +} + +// CHECK-LABEL: test_vcvtah_f32_bf16 +// CHECK: shl i32 %{{.*}}, 16 +float32_t test_vcvtah_f32_bf16(bfloat16_t a) { + return vcvtah_f32_bf16(a); +} + diff --git a/clang/test/Sema/aarch64-neon-bf16-ranges.c b/clang/test/Sema/aarch64-neon-bf16-ranges.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-neon-bf16-ranges.c @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -fsyntax-only -verify \ +// RUN: -triple aarch64-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +bf16 %s + +#include <arm_neon.h> + +int x; + +void test_vcopy_lane_bf16(bfloat16x4_t a, bfloat16x8_t b) { + // 0 <= lane1 <= 3; 0 <= lane2 <= 3 + (void)vcopy_lane_bf16(a, 3, a, 3); + (void)vcopy_lane_bf16(a, 0, a, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, 1, a, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, 4, a, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, -1, a, 1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, 0, a, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopy_lane_bf16(a, x, a, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} + + // 0 <= lane1 <= 7; 0 <= lane2 <= 3 + (void)vcopyq_lane_bf16(b, 7, a, 3); + (void)vcopyq_lane_bf16(b, 0, a, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopyq_lane_bf16(b, 1, a, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopyq_lane_bf16(b, 8, a, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopyq_lane_bf16(b, -1, a, 1); // expected-error {{argument value -1 is outside the 
valid range [0, 7]}} + (void)vcopyq_lane_bf16(b, 0, a, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopyq_lane_bf16(b, x, a, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} + + // 0 <= lane1 <= 3; 0 <= lane2 <= 7 + (void)vcopy_laneq_bf16(a, 3, b, 7); + (void)vcopy_laneq_bf16(a, 0, b, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopy_laneq_bf16(a, 1, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopy_laneq_bf16(a, 4, b, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopy_laneq_bf16(a, -1, b, 1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopy_laneq_bf16(a, 0, b, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopy_laneq_bf16(a, x, b, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} + + + // 0 <= lane1 <= 7; 0 <= lane2 <= 7 + (void)vcopyq_laneq_bf16(b, 7, b, 7); + (void)vcopyq_laneq_bf16(b, 0, b, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, 1, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, 8, b, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, -1, b, 1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, 0, b, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopyq_laneq_bf16(b, x, b, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -1062,7 +1062,8 @@ std::string S = Name; if (Name == "vcvt_f16_f32" || Name == "vcvt_f32_f16" || - 
Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32") + Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32" || + Name == "vcvt_f32_bf16") return Name; if (!typeCode.empty()) { diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -471,6 +471,16 @@ def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic; + // v8.6-A Bfloat Intrinsics + def int_aarch64_neon_bfcvt + : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; + def int_aarch64_neon_bfcvtn + : Intrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_aarch64_neon_bfcvtn2 + : Intrinsic<[llvm_v8bf16_ty], + [llvm_v8bf16_ty, llvm_v4f32_ty], + [IntrNoMem]>; + // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -785,6 +785,11 @@ def int_arm_neon_usdot : Neon_Dot_Intrinsic; // v8.6-A Bfloat Intrinsics +def int_arm_neon_vcvtfp2bf + : Intrinsic<[llvm_anyvector_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_arm_neon_vcvtbfp2bf + : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; + def int_arm_neon_bfdot : Neon_Dot_Intrinsic; def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7911,15 +7911,18 @@ class SIMD_BFCVTN : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, "bfcvtn", ".4h", ".4s", - []>; + [(set (v8bf16 V128:$Rd), + (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; class SIMD_BFCVTN2 : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, "bfcvtn2", ".8h", 
".4s", - []>; + [(set (v8bf16 V128:$dst), + (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; class BF16ToSinglePrecision<string asm> - : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", []>, + : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", + [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3955,12 +3955,16 @@ defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; -def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; -def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; -def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; -def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; -def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; +def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; +def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; +def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; +def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; // Patterns for vector long shift (by element width). 
These need to match all // three of zext, sext and anyext so it's easier to pull the patterns out of the diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -4743,6 +4743,29 @@ default: break; + // Scalar f32 -> bf16 + case Intrinsic::arm_neon_vcvtbfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + llvm::EVT DestTy = N->getValueType(0); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops); + return; + } + + // Vector v4f32 -> v4bf16 + case Intrinsic::arm_neon_vcvtfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4bf16, Ops); + return; + } + case Intrinsic::arm_mve_urshrl: SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false); return; diff --git a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare bfloat @llvm.aarch64.neon.bfcvt(float) +declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>) +declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: bfcvt h0, s0 +; CHECK-NEXT: ret +define bfloat @test_vcvth_bf16_f32(float %a) { +entry: + %vcvth_bf16_f32 = call bfloat @llvm.aarch64.neon.bfcvt(float %a) + ret bfloat %vcvth_bf16_f32 +} + +; CHECK-LABEL: test_vcvtq_low_bf16_f32 +; CHECK: bfcvtn v0.4h, v0.4s +; CHECK-NEXT: ret +define <8 x bfloat> 
@test_vcvtq_low_bf16_f32(<4 x float> %a) { +entry: + %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> %a) + ret <8 x bfloat> %cvt +} + +; CHECK-LABEL: test_vcvtq_high_bf16_f32 +; CHECK: bfcvtn2 v1.8h, v0.4s +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +define <8 x bfloat> @test_vcvtq_high_bf16_f32(<4 x float> %a, <8 x bfloat> %inactive) { +entry: + %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> %inactive, <4 x float> %a) + ret <8 x bfloat> %cvt +} + diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll @@ -163,3 +163,87 @@ %vgetq_lane = extractelement <8 x bfloat> %v, i32 7 ret bfloat %vgetq_lane } + +; vcopy_lane_bf16(a, 1, b, 3); +define <4 x bfloat> @test_vcopy_lane_bf16_v1(<4 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_lane_bf16_v1: +; CHECK-NEXT: mov v0.h[1], v1.h[3] +; CHECK-NEXT: ret +entry: + %vset_lane = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3> + ret <4 x bfloat> %vset_lane +} + +; vcopy_lane_bf16(a, 2, b, 0); +define <4 x bfloat> @test_vcopy_lane_bf16_v2(<4 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_lane_bf16_v2: +; CHECK-NEXT: mov v0.h[2], v1.h[0] +; CHECK-NEXT: ret +entry: + %vset_lane = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 3> + ret <4 x bfloat> %vset_lane +} + +; vcopyq_lane_bf16(a, 0, b, 2); +define <8 x bfloat> @test_vcopyq_lane_bf16_v1(<8 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_lane_bf16_v1: +; CHECK-NEXT: mov v0.h[0], v1.h[2] +; CHECK-NEXT: ret +entry: + %0 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %0, <8 x i32> <i32 10, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x bfloat> %vset_lane +} + +; vcopyq_lane_bf16(a, 6, b, 0); +define <8 x bfloat> @test_vcopyq_lane_bf16_v2(<8 x bfloat> %a, <4 x 
bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_lane_bf16_v2: +; CHECK-NEXT: mov v0.h[6], v1.h[0] +; CHECK-NEXT: ret +entry: + %0 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 7> + ret <8 x bfloat> %vset_lane +} + +; vcopy_laneq_bf16(a, 0, b, 7); +define <4 x bfloat> @test_vcopy_laneq_bf16_v1(<4 x bfloat> %a, <8 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_laneq_bf16_v1: +; CHECK-NEXT: mov v0.h[0], v1.h[7] +; CHECK-NEXT: ret +entry: + %vgetq_lane = extractelement <8 x bfloat> %b, i32 7 + %vset_lane = insertelement <4 x bfloat> %a, bfloat %vgetq_lane, i32 0 + ret <4 x bfloat> %vset_lane +} + +; vcopy_laneq_bf16(a, 3, b, 4); +define <4 x bfloat> @test_vcopy_laneq_bf16_v2(<4 x bfloat> %a, <8 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_laneq_bf16_v2: +; CHECK-NEXT: mov v0.h[3], v1.h[4] +; CHECK-NEXT: ret +entry: + %vgetq_lane = extractelement <8 x bfloat> %b, i32 4 + %vset_lane = insertelement <4 x bfloat> %a, bfloat %vgetq_lane, i32 3 + ret <4 x bfloat> %vset_lane +} + +; vcopyq_laneq_bf16(a, 3, b, 7); +define <8 x bfloat> @test_vcopyq_laneq_bf16_v1(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_laneq_bf16_v1: +; CHECK-NEXT: mov v0.h[3], v1.h[7] +; CHECK-NEXT: ret +entry: + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7> + ret <8 x bfloat> %vset_lane +} + +; vcopyq_laneq_bf16(a, 6, b, 2); +define <8 x bfloat> @test_vcopyq_laneq_bf16_v2(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_laneq_bf16_v2: +; CHECK-NEXT: mov v0.h[6], v1.h[2] +; CHECK-NEXT: ret +entry: + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 10, i32 7> + ret <8 x bfloat> %vset_lane +} diff --git a/llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll b/llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll @@ -0,0 +1,56 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon,+bf16,+fullfp16 | FileCheck %s + +declare bfloat @llvm.arm.neon.vcvtbfp2bf(float) + +; Hard float ABI +declare <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float>) + +define arm_aapcs_vfpcc <4 x bfloat> @test_vcvt_bf16_f32_hardfp(<4 x float> %a) { +; CHECK-LABEL: test_vcvt_bf16_f32_hardfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcvt.bf16.f32 d0, q0 +; CHECK-NEXT: bx lr +entry: + %vcvtfp2bf1.i.i = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> %a) + ret <4 x bfloat> %vcvtfp2bf1.i.i +} + +define arm_aapcs_vfpcc bfloat @test_vcvth_bf16_f32_hardfp(float %a) { +; CHECK-LABEL: test_vcvth_bf16_f32_hardfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcvtb.bf16.f32 s0, s0 +; CHECK-NEXT: bx lr +entry: + %vcvtbfp2bf.i = call bfloat @llvm.arm.neon.vcvtbfp2bf(float %a) + ret bfloat %vcvtbfp2bf.i +} + +; Soft float ABI +declare <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float>) + +define <2 x i32> @test_vcvt_bf16_f32_softfp(<4 x float> %a) { +; CHECK-LABEL: test_vcvt_bf16_f32_softfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vcvt.bf16.f32 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr +entry: + %vcvtfp2bf1.i.i = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> %a) + %.cast = bitcast <4 x i16> %vcvtfp2bf1.i.i to <2 x i32> + ret <2 x i32> %.cast +} + +define bfloat @test_vcvth_bf16_f32_softfp(float %a) #1 { +; CHECK-LABEL: test_vcvth_bf16_f32_softfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcvtb.bf16.f32 s0, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %vcvtbfp2bf.i = call bfloat @llvm.arm.neon.vcvtbfp2bf(float %a) #3 + ret bfloat %vcvtbfp2bf.i +}