diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6241,28 +6241,10 @@ case NEON::BI__builtin_neon_vbfdot_v: case NEON::BI__builtin_neon_vbfdotq_v: { llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot"); } - case NEON::BI__builtin_neon_vbfmmlaq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla"); - } - case NEON::BI__builtin_neon_vbfmlalbq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb"); - } - case NEON::BI__builtin_neon_vbfmlaltq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt"); - } case NEON::BI__builtin_neon___a32_vcvt_bf16_v: { llvm::Type *Tys[1] = { Ty }; Function *F = CGM.getIntrinsic(Int, Tys); diff --git a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c --- a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c @@ -1,146 +1,138 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ // RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s #include <arm_neon.h> -// CHECK-LABEL: test_vbfdot_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); } -// CHECK-LABEL: test_vbfdotq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ return vbfdotq_f32(r, a, b); } -// CHECK-LABEL: test_vbfdot_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float> -// CHECK-NEXT %lane = 
shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer -// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ return vbfdot_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfdotq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float> -// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfdot_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float> -// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> -// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfdotq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float> -// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> 
undef, <4 x i32> zeroinitializer -// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmmlaq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmmla1.i +// CHECK-LABEL: @test_vbfmmlaq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]] +// float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlalbq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlaltq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlalbq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT 
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmlalbq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfmlaltq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmlaltq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t 
test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); } diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c --- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c @@ -12,10 +12,8 @@ // CHECK-LABEL: @test_vbfdot_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); @@ -23,10 +21,8 @@ // CHECK-LABEL: @test_vbfdotq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ return vbfdotq_f32(r, a, b); @@ -36,10 +32,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ return vbfdot_lane_f32(r, a, b, 0); @@ -49,10 +44,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> 
@llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, 3); @@ -62,10 +56,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3> -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, 3); @@ -75,10 +68,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, 0); @@ -86,10 +78,8 @@ // CHECK-LABEL: @test_vbfmmlaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]] +// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); @@ -97,10 +87,8 @@ // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x 
float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); @@ -108,10 +96,8 @@ // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); @@ -120,10 +106,8 @@ // CHECK-LABEL: @test_vbfmlalbq_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); @@ -132,10 +116,8 @@ // CHECK-LABEL: @test_vbfmlalbq_laneq_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); @@ -144,10 +126,8 @@ // CHECK-LABEL: @test_vbfmlaltq_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x 
float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); @@ -156,10 +136,8 @@ // CHECK-LABEL: @test_vbfmlaltq_laneq_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -184,6 +184,10 @@ [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + class AdvSIMD_BF16FML_Intrinsic + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; } // Arithmetic ops @@ -466,9 +470,12 @@ def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; - def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic; - def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic; - def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic; + def int_aarch64_neon_bfmmla + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; + def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic; + def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic; // v8.6-A Bfloat Intrinsics diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -791,14 +791,17 @@ : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; def int_arm_neon_bfdot : Neon_Dot_Intrinsic; -def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic; - -class Neon_FML_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; -def int_arm_neon_bfmlalb : Neon_FML_Intrinsic; -def int_arm_neon_bfmlalt : Neon_FML_Intrinsic; +def int_arm_neon_bfmmla + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; + +class Neon_BF16FML_Intrinsic + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic; +def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic; def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -632,6 +632,63 @@ return true; } } + + // Changed in 12.0: bfdot accepts v4bf16 and v8bf16 instead of v8i8 and v16i8 + // respectively + if ((Name.startswith("arm.neon.bfdot.") || 
Name.startswith("aarch64.neon.bfdot.")) && + Name.endswith("i8")) { + Intrinsic::ID IID = + StringSwitch(Name) + .Cases("arm.neon.bfdot.v2f32.v8i8", + "arm.neon.bfdot.v4f32.v16i8", + Intrinsic::arm_neon_bfdot) + .Cases("aarch64.neon.bfdot.v2f32.v8i8", + "aarch64.neon.bfdot.v4f32.v16i8", + Intrinsic::aarch64_neon_bfdot) + .Default(Intrinsic::not_intrinsic); + if (IID == Intrinsic::not_intrinsic) + break; + + size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits(); + assert((OperandWidth == 64 || OperandWidth == 128) && + "Unexpected operand width"); + LLVMContext &Ctx = F->getParent()->getContext(); + std::array Tys {{ + F->getReturnType(), + FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16) + }}; + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); + return true; + } + + // Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore + // and accept v8bf16 instead of v16i8 + if ((Name.startswith("arm.neon.bfm") || + Name.startswith("aarch64.neon.bfm")) && + Name.endswith(".v4f32.v16i8")) { + Intrinsic::ID IID = + StringSwitch(Name) + .Case("arm.neon.bfmmla.v4f32.v16i8", + Intrinsic::arm_neon_bfmmla) + .Case("arm.neon.bfmlalb.v4f32.v16i8", + Intrinsic::arm_neon_bfmlalb) + .Case("arm.neon.bfmlalt.v4f32.v16i8", + Intrinsic::arm_neon_bfmlalt) + .Case("aarch64.neon.bfmmla.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmmla) + .Case("aarch64.neon.bfmlalb.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmlalb) + .Case("aarch64.neon.bfmlalt.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmlalt) + .Default(Intrinsic::not_intrinsic); + if (IID == Intrinsic::not_intrinsic) + break; + + std::array Tys; + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); + return true; + } break; } @@ -3618,6 +3675,30 @@ break; } + case Intrinsic::arm_neon_bfdot: + case Intrinsic::arm_neon_bfmmla: + case Intrinsic::arm_neon_bfmlalb: + case Intrinsic::arm_neon_bfmlalt: + case Intrinsic::aarch64_neon_bfdot: + case Intrinsic::aarch64_neon_bfmmla: + case Intrinsic::aarch64_neon_bfmlalb: + case Intrinsic::aarch64_neon_bfmlalt: { + SmallVector Args; + assert(CI->getNumArgOperands() == 3 && + "Mismatch between function args and call args"); + size_t OperandWidth = + CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits(); + assert((OperandWidth == 64 || OperandWidth == 128) && + "Unexpected operand width"); + Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16); + auto Iter = CI->arg_operands().begin(); + Args.push_back(*Iter++); + Args.push_back(Builder.CreateBitCast(*Iter++, NewTy)); + Args.push_back(Builder.CreateBitCast(*Iter++, NewTy)); + NewCall = Builder.CreateCall(NewFn, Args); + break; + } + case Intrinsic::bitreverse: NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)}); break; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7841,9 +7841,9 @@ multiclass SIMDThreeSameVectorBFDot { def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, - v2f32, v8i8>; + v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, - v4f32, v16i8>; + v4f32, v8bf16>; } class BaseSIMDThreeSameVectorBF16DotI { + VectorIndexS:$idx)))))))]> { bits<2> idx; let Inst{21} = idx{0}; // L @@ -7871,16 +7871,16 @@ multiclass SIMDThreeSameVectorBF16DotI { def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", - ".2h", V64, v2f32, v8i8>; + ".2h", V64, 
v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", - ".2h", V128, v4f32, v16i8>; + ".2h", V128, v4f32, v8bf16>; } class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } @@ -7890,10 +7890,10 @@ "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", [(set (v4f32 V128:$dst), (v4f32 (OpNode (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 (bitconvert (v8bf16 + (v8bf16 V128:$Rn), + (v8bf16 (AArch64duplane16 (v8bf16 V128_lo:$Rm), - VectorIndexH:$idx)))))))]>, + VectorIndexH:$idx)))))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; @@ -7917,8 +7917,8 @@ V128, asm, ".4s", [(set (v4f32 V128:$dst), (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", ", $Rm", ".8h", "}"); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -798,6 +798,23 @@ def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; def BFCVT : BF16ToSinglePrecision<"bfcvt">; + +// Vector-scalar BFDOT: +// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit +// register (the instruction uses a single 32-bit lane from it), so the pattern +// is a bit tricky. +def : Pat<(v2f32 (int_aarch64_neon_bfdot + (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (v4bf16 (bitconvert + (v2i32 (AArch64duplane32 + (v4i32 (bitconvert + (v8bf16 (insert_subvector undef, + (v4bf16 V64:$Rm), + (i64 0))))), + VectorIndexS:$idx)))))), + (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; } // ARMv8.6A AArch64 matrix multiplication diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -9079,11 +9079,11 @@ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; } -def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>; -def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>; +def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>; +def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>; -defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>; -defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>; +defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; class BF16MM<bit Q, RegisterClass RegTy, string opc> @@ -9091,8 +9091,8 @@ (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD, "", "", [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - (v16i8 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); let DecoderNamespace = "VFPV8"; } @@ -9106,8 +9106,8 @@ NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "", [(set (v4f32 QPR:$dst), (OpNode (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - (v16i8 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let DecoderNamespace = "VFPV8"; } @@ -9128,9 +9128,9 @@ def : Pat< (v4f32 (OpNode (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - 
(v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), - VectorIndex16:$lane)))))), + (v8bf16 QPR:$Vn), + (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), + VectorIndex16:$lane)))), (!cast<Instruction>(NAME) QPR:$Vd, QPR:$Vn, (EXTRACT_SUBREG QPR:$Vm, diff --git a/llvm/test/Bitcode/aarch64-bf16-upgrade.ll b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll @@ -0,0 +1,76 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; Bitcode was generated from file below + +define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdot_f32 +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat> + ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3) + %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ret <2 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdotq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmmlaq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmmla1.i +} + +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlalbq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalb1.i +} + +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlaltq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 
= bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) +; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) \ No newline at end of file diff --git a/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc new file mode 100644 diff --git a/llvm/test/Bitcode/arm-bf16-upgrade.ll b/llvm/test/Bitcode/arm-bf16-upgrade.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Bitcode/arm-bf16-upgrade.ll @@ -0,0 +1,76 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; Bitcode was generated from file below + +define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdot_f32 +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat> + ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3) + ret <2 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdotq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmmlaq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmmla1.i +} + +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: 
@test_vbfmlalbq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalb1.i +} + +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlaltq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) +; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) \ No newline at end of file diff --git a/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc b/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc new file mode 100644 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll --- a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll @@ -7,10 +7,8 @@ ; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %a to <8 x i8> - %1 = bitcast <4 x bfloat> %b to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) - ret <2 x float> %vbfdot1.i + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -19,24 +17,22 @@ ; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfdot1.i + %vbfdot3.i = call <4 x float> 
@llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfdot_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfdot v0.2s, v1.4h, v2.2h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0] ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -45,12 +41,11 @@ ; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3] ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { @@ -59,26 +54,25 @@ ; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3] ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfdotq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfdot v0.4s, v1.8h, v2.2h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0] ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x 
i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) + ret <4 x float> %vbfdot3.i } define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -87,10 +81,8 @@ ; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmmla1.i + %vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmmlaq_v3.i } define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -99,10 +91,8 @@ ; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -111,23 +101,20 @@ ; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfmlalbq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -137,23 +124,20 @@ ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> 
@test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfmlaltq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -163,14 +147,12 @@ ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } -declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) diff --git a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll --- a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll @@ -7,10 +7,8 @@ ; CHECK-NEXT: vdot.bf16 d0, d1, d2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %a to <8 x i8> - %1 = bitcast <4 x bfloat> %b to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) - ret <2 x float> %vbfdot1.i + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -19,10 +17,8 @@ ; CHECK-NEXT: vdot.bf16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfdot1.i + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x 
bfloat> %a, <8 x bfloat> %b) #3 + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { @@ -31,12 +27,11 @@ ; CHECK-NEXT: vdot.bf16 d0, d1, d2[0] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -46,12 +41,11 @@ ; CHECK-NEXT: vdot.bf16 q0, q1, q8 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3 + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { @@ -60,12 +54,11 @@ ; CHECK-NEXT: vdot.bf16 d0, d1, d3[1] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { @@ -75,12 +68,11 @@ ; CHECK-NEXT: vdot.bf16 q0, q1, d4[0] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3 + ret <4 x float> %vbfdot3.i } 
 define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -89,10 +81,8 @@
 ; CHECK-NEXT:    vmmla.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmmla1.i
+  %vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmmlaq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -101,10 +91,8 @@
 ; CHECK-NEXT:    vfmab.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -113,10 +101,8 @@
 ; CHECK-NEXT:    vfmat.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@@ -127,10 +113,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -140,10 +124,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@@ -154,10 +136,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -167,10 +147,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -181,14 +159,12 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
   ret <4 x float> %vbfmlalt1.i
 }
 
-declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
-declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)