diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -252,6 +252,34 @@ : Op<(call "vbfmlalt", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; +def OP_VCVT_F32_BF16 + : Op<(bitcast "R", + (call "vshll_n", (bitcast "int16x4_t", $p0), + (literal "int32_t", "16")))>; +def OP_VCVT_F32_BF16_LO + : Op<(call "vcvt_f32_bf16", (call "vget_low", $p0))>; +def OP_VCVT_F32_BF16_HI + : Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>; + +def OP_VCVT_BF16_F32_LO_A64 + : Op<(call "__a64_vcvtq_low_bf16", $p0)>; +def OP_VCVT_BF16_F32_A64 + : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>; + +def OP_VCVT_BF16_F32_A32 + : Op<(call "__a32_vcvt_bf16", $p0)>; + +def OP_VCVT_BF16_F32_LO_A32 + : Op<(call "vcombine", (cast "bfloat16x4_t", (literal "uint64_t", "0ULL")), + (call "__a32_vcvt_bf16", $p0))>; +def OP_VCVT_BF16_F32_HI_A32 + : Op<(call "vcombine", (call "__a32_vcvt_bf16", $p1), + (call "vget_low", $p0))>; + +def OP_CVT_F32_BF16 + : Op<(bitcast "R", (op "<<", (bitcast "int32_t", $p0), + (literal "int32_t", "16")))>; + //===----------------------------------------------------------------------===// // Auxiliary Instructions //===----------------------------------------------------------------------===// @@ -1942,7 +1970,31 @@ def VLD3_DUP_BF : WInst<"vld3_dup", "3(c*!)", "bQb">; def VLD4_DUP_BF : WInst<"vld4_dup", "4(c*!)", "bQb">; + def VCVT_F32_BF16 : SOpInst<"vcvt_f32_bf16", "(F>)(Bq!)", "Qb", OP_VCVT_F32_BF16>; + def VCVT_LOW_F32_BF16 : SOpInst<"vcvt_low_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_LO>; + def VCVT_HIGH_F32_BF16 : SOpInst<"vcvt_high_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_HI>; + + def SCALAR_CVT_BF16_F32 : SInst<"vcvth_bf16", "(1B)1", "f">; + def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>; +} + +let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && 
!defined(__aarch64__)" in { + def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">; + def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>; + def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>; + def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>; +} + +let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in { + def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; + def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; + def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; + def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>; + def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>; + def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>; + def COPY_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..IQI", "b", OP_COPY_LN>; + def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>; } let ArchGuard = "defined(__ARM_FEATURE_BF16) && !defined(__aarch64__)" in { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4656,6 +4656,7 @@ TypeModifier } static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { + NEONMAP1(__a32_vcvt_bf16_v, arm_neon_vcvtfp2bf, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -4729,6 +4730,7 @@ NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0), NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0), + NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0), NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0), NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0), @@ -4945,6 +4947,7 @@ }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { + 
NEONMAP1(__a64_vcvtq_low_bf16_v, aarch64_neon_bfcvtn, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -5004,6 +5007,7 @@ NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0), NEONMAP0(vcvtq_f16_v), NEONMAP0(vcvtq_f32_v), + NEONMAP1(vcvtq_high_bf16_v, aarch64_neon_bfcvtn2, 0), NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), @@ -5159,6 +5163,7 @@ NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType), @@ -6157,6 +6162,27 @@ llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt"); } + case NEON::BI__builtin_neon___a64_vcvtq_low_bf16_v: { + llvm::Type *InputTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, true)); + llvm::Type *Tys[2] = { Ty, InputTy }; + Function *F = CGM.getIntrinsic(Int, Tys); + return EmitNeonCall(F, Ops, "bfcvtn"); + } + case NEON::BI__builtin_neon_vcvtq_high_bf16_v: { + llvm::Type *InputTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, true)); + llvm::Type *Tys[3] = { Ty, Ty, InputTy }; + Function *F = CGM.getIntrinsic(Int, Tys); + return EmitNeonCall(F, Ops, "bfcvtn2"); + } + case NEON::BI__builtin_neon___a32_vcvt_bf16_v: { + llvm::Type *InputTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, true)); + llvm::Type *Tys[2] = { Ty, InputTy }; + Function *F = 
CGM.getIntrinsic(Int, Tys); + return EmitNeonCall(F, Ops, "vcvtfp2bf"); + } } assert(Int && "Expected valid intrinsic number"); @@ -6364,6 +6390,7 @@ case NEON::BI__builtin_neon_vsha1cq_u32: case NEON::BI__builtin_neon_vsha1pq_u32: case NEON::BI__builtin_neon_vsha1mq_u32: + case NEON::BI__builtin_neon_vcvth_bf16_f32: case clang::ARM::BI_MoveToCoprocessor: case clang::ARM::BI_MoveToCoprocessor2: return false; @@ -6847,6 +6874,22 @@ return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops, "vsha1h"); + case NEON::BI__builtin_neon_vcvth_bf16_f32: { + LLVMContext &Ctx = CGM.getLLVMContext(); + const bool AllowBFloatArgsAndRet = + getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); + llvm::Type *DestTy = AllowBFloatArgsAndRet ? llvm::Type::getBFloatTy(Ctx) + : llvm::Type::getInt32Ty(Ctx); + llvm::Type *Tys[] = { DestTy, Ops[0]->getType() }; + Value *Ret = + EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf, Tys), Ops, + "vcvtbfp2bf"); + if (AllowBFloatArgsAndRet) + return Ret; + Value *RetLO = Builder.CreateTrunc(Ret, llvm::Type::getInt16Ty(Ctx)); + return Builder.CreateBitCast(RetLO, llvm::Type::getBFloatTy(Ctx)); + } + // The ARM _MoveToCoprocessor builtins put the input register value as // the first argument, but the LLVM intrinsic expects it as the third one. 
case ARM::BI_MoveToCoprocessor: diff --git a/clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 \ +// RUN: -triple aarch64-arm-none-eabi -target-cpu cortex-a75 \ +// RUN: -target-feature +neon -target-feature +bf16 -ffreestanding \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck --check-prefixes=CHECK,CHECK-LE %s +// RUN: %clang_cc1 \ +// RUN: -triple aarch64_be-arm-none-eabi -target-cpu cortex-a75 \ +// RUN: -target-feature +neon -target-feature +bf16 -ffreestanding \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg | FileCheck %s + +#include <arm_neon.h> + +// CHECK-LABEL: test_vcopy_lane_bf16_v1 +// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 3 +// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 1 +// CHECK-LE: ret <4 x bfloat> %[[RES]] +bfloat16x4_t test_vcopy_lane_bf16_v1(bfloat16x4_t a, bfloat16x4_t b) { + return vcopy_lane_bf16(a, 1, b, 3); +} + +// CHECK-LABEL: test_vcopy_lane_bf16_v2 +// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 0 +// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 2 +// CHECK-LE: ret <4 x bfloat> %[[RES]] +bfloat16x4_t test_vcopy_lane_bf16_v2(bfloat16x4_t a, bfloat16x4_t b) { + return vcopy_lane_bf16(a, 2, b, 0); +} + +// CHECK-LABEL: test_vcopyq_lane_bf16_v1 +// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 2 +// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 0 +// CHECK-LE: ret <8 x bfloat> %[[RES]] +bfloat16x8_t test_vcopyq_lane_bf16_v1(bfloat16x8_t a, bfloat16x4_t b) { + return vcopyq_lane_bf16(a, 0, b, 2); +} + +// CHECK-LABEL: test_vcopyq_lane_bf16_v2 +// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 0 +// CHECK: %[[RES:.*]] = 
insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 6 +// CHECK-LE: ret <8 x bfloat> %[[RES]] +bfloat16x8_t test_vcopyq_lane_bf16_v2(bfloat16x8_t a, bfloat16x4_t b) { + return vcopyq_lane_bf16(a, 6, b, 0); +} + +// CHECK-LABEL: test_vcopy_laneq_bf16_v1 +// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 7 +// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 0 +// CHECK-LE: ret <4 x bfloat> %[[RES]] +bfloat16x4_t test_vcopy_laneq_bf16_v1(bfloat16x4_t a, bfloat16x8_t b) { + return vcopy_laneq_bf16(a, 0, b, 7); +} + +// CHECK-LABEL: test_vcopy_laneq_bf16_v2 +// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 4 +// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 3 +// CHECK-LE: ret <4 x bfloat> %[[RES]] +bfloat16x4_t test_vcopy_laneq_bf16_v2(bfloat16x4_t a, bfloat16x8_t b) { + return vcopy_laneq_bf16(a, 3, b, 4); +} + +// CHECK-LABEL: test_vcopyq_laneq_bf16_v1 +// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 7 +// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 3 +// CHECK-LE: ret <8 x bfloat> %[[RES]] +bfloat16x8_t test_vcopyq_laneq_bf16_v1(bfloat16x8_t a, bfloat16x8_t b) { + return vcopyq_laneq_bf16(a, 3, b, 7); + +} + +// CHECK-LABEL: test_vcopyq_laneq_bf16_v2 +// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 2 +// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 6 +// CHECK-LE: ret <8 x bfloat> %[[RES]] +bfloat16x8_t test_vcopyq_laneq_bf16_v2(bfloat16x8_t a, bfloat16x8_t b) { + return vcopyq_laneq_bf16(a, 6, b, 2); +} + diff --git a/clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c b/clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c @@ -0,0 +1,88 @@ +// RUN: %clang_cc1 \ +// RUN: -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ +// RUN: -disable-O0-optnone 
-emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg -instcombine \ +// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A64 %s +// RUN: %clang_cc1 \ +// RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +bf16 -mfloat-abi hard \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg -instcombine \ +// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-FP16 %s +// RUN: %clang_cc1 \ +// RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +bf16 -mfloat-abi softfp \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg -instcombine \ +// RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-NOFP16 %s + +#include <arm_neon.h> + +// CHECK-LABEL: test_vcvt_f32_bf16 +// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32> +// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16> +float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { + return vcvt_f32_bf16(a); +} + +// CHECK-LABEL: test_vcvtq_low_f32_bf16 +// CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32> +// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16> +float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { + return vcvtq_low_f32_bf16(a); +} + +// CHECK-LABEL: test_vcvtq_high_f32_bf16 +// CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32> +// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16> +float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { + return vcvtq_high_f32_bf16(a); +} + +// CHECK-LABEL: test_vcvt_bf16_f32 +// CHECK-A64: %[[CVT:.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8bf16.v4f32(<4 x float> %a) +// CHECK-A64: shufflevector <8 x bfloat> %[[CVT]], <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A32-FP16: call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a) +// CHECK-A32-NOFP16: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a) +bfloat16x4_t 
test_vcvt_bf16_f32(float32x4_t a) { + return vcvt_bf16_f32(a); +} + +// CHECK-LABEL: test_vcvtq_low_bf16_f32 +// CHECK-A64: call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8bf16.v4f32(<4 x float> %a) +// CHECK-A32-FP16: %[[CVT:.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32 +// CHECK-A32-FP16: shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> %[[CVT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-A32-NOFP16: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32 +// CHECK-A32-NOFP16: shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { + return vcvtq_low_bf16_f32(a); +} + +// CHECK-LABEL: test_vcvtq_high_bf16_f32 +// CHECK-A64: call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2.v8bf16.v8bf16.v4f32(<8 x bfloat> %inactive, <4 x float> %a) +// CHECK-A32-FP16: %[[CVT:.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a) +// CHECK-A32-FP16: %[[INACT:.*]] = shufflevector <8 x bfloat> %inactive, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A32-FP16: shufflevector <4 x bfloat> %[[CVT]], <4 x bfloat> %[[INACT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-A32-NOFP16: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a) +// CHECK-A32-NOFP16: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-A32-NOFP16: shufflevector <4 x bfloat> %{{.*}}, <4 x bfloat> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { + return vcvtq_high_bf16_f32(inactive, a); +} + +// CHECK-LABEL: test_vcvth_bf16_f32 +// CHECK-A64: call bfloat @llvm.aarch64.neon.bfcvt.bf16.f32 +// CHECK-A32-FP16: call bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float %a) +// CHECK-A32-NOFP16: call i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float %a) +bfloat16_t test_vcvth_bf16_f32(float32_t a) { + return vcvth_bf16_f32(a); +} + +// CHECK-LABEL: test_vcvtah_f32_bf16 +// CHECK: shl i32 %{{.*}}, 16 +float32_t test_vcvtah_f32_bf16(bfloat16_t a) { + return vcvtah_f32_bf16(a); +} + diff --git 
a/clang/test/Sema/aarch64-neon-bf16-ranges.c b/clang/test/Sema/aarch64-neon-bf16-ranges.c new file mode 100644 --- /dev/null +++ b/clang/test/Sema/aarch64-neon-bf16-ranges.c @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -fsyntax-only -verify \ +// RUN: -triple aarch64-arm-none-eabi -target-feature +neon \ +// RUN: -target-feature +bf16 %s + +#include <arm_neon.h> + +int x; + +void test_vcopy_lane_bf16(bfloat16x4_t a, bfloat16x8_t b) { + // 0 <= lane1 <= 3; 0 <= lane2 <= 3 + (void)vcopy_lane_bf16(a, 3, a, 3); + (void)vcopy_lane_bf16(a, 0, a, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, 1, a, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, 4, a, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, -1, a, 1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopy_lane_bf16(a, 0, a, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopy_lane_bf16(a, x, a, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} + + // 0 <= lane1 <= 7; 0 <= lane2 <= 3 + (void)vcopyq_lane_bf16(b, 7, a, 3); + (void)vcopyq_lane_bf16(b, 0, a, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopyq_lane_bf16(b, 1, a, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopyq_lane_bf16(b, 8, a, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopyq_lane_bf16(b, -1, a, 1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopyq_lane_bf16(b, 0, a, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopyq_lane_bf16(b, x, a, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} + + // 0 <= lane1 <= 3; 0 <= lane2 <= 7 + (void)vcopy_laneq_bf16(a, 3, b, 7); + (void)vcopy_laneq_bf16(a, 0, b, 8); 
// expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopy_laneq_bf16(a, 1, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopy_laneq_bf16(a, 4, b, 0); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + (void)vcopy_laneq_bf16(a, -1, b, 1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + (void)vcopy_laneq_bf16(a, 0, b, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopy_laneq_bf16(a, x, b, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} + + + // 0 <= lane1 <= 7; 0 <= lane2 <= 7 + (void)vcopyq_laneq_bf16(b, 7, b, 7); + (void)vcopyq_laneq_bf16(b, 0, b, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, 1, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, 8, b, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, -1, b, 1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + (void)vcopyq_laneq_bf16(b, 0, b, x); // expected-error-re {{argument {{.*}} must be a constant integer}} + (void)vcopyq_laneq_bf16(b, x, b, 0); // expected-error-re {{argument {{.*}} must be a constant integer}} +} + diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -1064,7 +1064,8 @@ std::string S = Name; if (Name == "vcvt_f16_f32" || Name == "vcvt_f32_f16" || - Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32") + Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32" || + Name == "vcvt_f32_bf16") return Name; if (!typeCode.empty()) { diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ 
b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -471,6 +471,16 @@ def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic; + // v8.6-A Bfloat Intrinsics + def int_aarch64_neon_bfcvt + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + def int_aarch64_neon_bfcvtn + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; + def int_aarch64_neon_bfcvtn2 + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_anyvector_ty], + [IntrNoMem]>; + // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -786,6 +786,12 @@ // v8.6-A Bfloat Intrinsics +def int_arm_neon_vcvtfp2bf + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; + +def int_arm_neon_vcvtbfp2bf + : Intrinsic<[llvm_any_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7911,15 +7911,18 @@ class SIMD_BFCVTN : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, "bfcvtn", ".4h", ".4s", - []>; + [(set (v8bf16 V128:$Rd), + (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; class SIMD_BFCVTN2 : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, "bfcvtn2", ".8h", ".4s", - []>; + [(set (v8bf16 V128:$dst), + (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; class BF16ToSinglePrecision - : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", []>, + : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", + [(set (bf16 
FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -4653,6 +4653,29 @@ default: break; + // Scalar f32 -> bf16 + case Intrinsic::arm_neon_vcvtbfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + llvm::EVT DestTy = N->getValueType(0); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops); + return; + } + + // Vector v4f32 -> v4bf16 + case Intrinsic::arm_neon_vcvtfp2bf: { + SDLoc dl(N); + const SDValue &Src = N->getOperand(1); + SDValue Pred = getAL(CurDAG, dl); + SDValue Reg0 = CurDAG->getRegister(0, MVT::i32); + SDValue Ops[] = { Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4i16, Ops); + return; + } + case Intrinsic::arm_mve_urshrl: SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false); return; diff --git a/llvm/test/CodeGen/AArch64/bf16-intrinsics.ll b/llvm/test/CodeGen/AArch64/bf16-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/bf16-intrinsics.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare bfloat @llvm.aarch64.neon.bfcvt.f16.f32(float) +declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8f16.v4f32(<4 x float>) +declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2.v8f16.v8f16.v4f32(<8 x bfloat>, <4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: bfcvt h0, s0 +; CHECK-NEXT: ret +define bfloat @test_vcvth_bf16_f32(float %a) { +entry: + %vcvth_bf16_f32 = call bfloat @llvm.aarch64.neon.bfcvt.f16.f32(float %a) + ret bfloat %vcvth_bf16_f32 +} + +; CHECK-LABEL: test_vcvtq_low_bf16_f32 +; CHECK: bfcvtn v0.4h, v0.4s +; CHECK-NEXT: ret +define 
<8 x bfloat> @test_vcvtq_low_bf16_f32(<4 x float> %a) { +entry: + %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8f16.v4f32(<4 x float> %a) + ret <8 x bfloat> %cvt +} + +; CHECK-LABEL: test_vcvtq_high_bf16_f32 +; CHECK: bfcvtn2 v1.8h, v0.4s +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +define <8 x bfloat> @test_vcvtq_high_bf16_f32(<4 x float> %a, <8 x bfloat> %inactive) { +entry: + %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2.v8f16.v8f16.v4f32(<8 x bfloat> %inactive, <4 x float> %a) + ret <8 x bfloat> %cvt +} + diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float) +declare <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: vcvtb.bf16.f32 s0, s0 +define arm_aapcs_vfpcc float @test_vcvth_bf16_f32(float %a) { +entry: + %vcvtbfp2bf = tail call i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float %a) + %tmp.0.insert.ext = and i32 %vcvtbfp2bf, 65535 + %0 = bitcast i32 %tmp.0.insert.ext to float + ret float %0 +} + +; CHECK-LABEL: test_vcvt_bf16_f32 +; CHECK: vcvt.bf16.f32 d0, q0 +define arm_aapcs_vfpcc <2 x i32> @test_vcvt_bf16_f32(<4 x float> %a) { +entry: + %vcvtfp2bf1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a) + %0 = bitcast <4 x i16> %vcvtfp2bf1.i.i to <2 x i32> + ret <2 x i32> %0 +} diff --git a/llvm/test/CodeGen/ARM/bf16-intrinsics.ll b/llvm/test/CodeGen/ARM/bf16-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/bf16-intrinsics.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+fullfp16 -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare bfloat 
@llvm.arm.neon.vcvtbfp2bf.bf16.f32(float) +declare <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: vcvtb.bf16.f32 s0, s0 +define arm_aapcs_vfpcc float @test_vcvth_bf16_f32(float %a) { +entry: + %vcvtbfp2bf.i = tail call bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float %a) + %0 = bitcast bfloat %vcvtbfp2bf.i to i16 + %tmp.0.insert.ext.i = zext i16 %0 to i32 + %1 = bitcast i32 %tmp.0.insert.ext.i to float + ret float %1 +} + +; CHECK-LABEL: test_vcvt_bf16_f32 +; CHECK: vcvt.bf16.f32 d0, q0 +define arm_aapcs_vfpcc <4 x bfloat> @test_vcvt_bf16_f32(<4 x float> %a) { +entry: + %vcvtfp2bf1.i.i = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a) + ret <4 x bfloat> %vcvtfp2bf1.i.i +} +