Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -6241,24 +6241,6 @@
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
   }
-  case NEON::BI__builtin_neon_vbfmmlaq_v: {
-    llvm::Type *InputTy =
-        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
-    llvm::Type *Tys[2] = { Ty, InputTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla");
-  }
-  case NEON::BI__builtin_neon_vbfmlalbq_v: {
-    llvm::Type *InputTy =
-        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
-    llvm::Type *Tys[2] = { Ty, InputTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb");
-  }
-  case NEON::BI__builtin_neon_vbfmlaltq_v: {
-    llvm::Type *InputTy =
-        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
-    llvm::Type *Tys[2] = { Ty, InputTy };
-    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt");
-  }
   case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {
     llvm::Type *Tys[1] = { Ty };
     Function *F = CGM.getIntrinsic(Int, Tys);
Index: clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
===================================================================
--- clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
+++ clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
@@ -1,146 +1,146 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
 // RUN:   -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vbfdot_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8>
-// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8>
-// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
-// CHECK-NEXT ret <2 x float> %vbfdot1.i
+// CHECK-LABEL: @test_vbfdot_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) [[ATTR3:#.*]]
+// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
+//
 float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
   return vbfdot_f32(r, a, b);
 }
 
-// CHECK-LABEL: test_vbfdotq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
-// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfdot1.i
+// CHECK-LABEL: @test_vbfdotq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
+//
 float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
   return vbfdotq_f32(r, a, b);
 }
 
-// CHECK-LABEL: test_vbfdot_lane_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>
-// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
-// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>
-// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>
-// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
-// CHECK-NEXT ret <2 x float> %vbfdot1.i
+// CHECK-LABEL: @test_vbfdot_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) [[ATTR3]]
+// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
+//
 float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
   return vbfdot_lane_f32(r, a, b, 0);
 }
 
-// CHECK-LABEL: test_vbfdotq_laneq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>
-// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>
-// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
-// CHECK-NEXT ret <4 x float> %vbfdot1.i
+// CHECK-LABEL: @test_vbfdotq_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
+//
 float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfdotq_laneq_f32(r, a, b, 3);
 }
 
-// CHECK-LABEL: test_vbfdot_laneq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float>
-// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8>
-// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8>
-// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
-// CHECK-NEXT ret <2 x float> %vbfdot1.i
+// CHECK-LABEL: @test_vbfdot_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) [[ATTR3]]
+// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
+//
 float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
   return vbfdot_laneq_f32(r, a, b, 3);
 }
 
-// CHECK-LABEL: test_vbfdotq_lane_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float>
-// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8>
-// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
-// CHECK-NEXT ret <4 x float> %vbfdot1.i
+// CHECK-LABEL: @test_vbfdotq_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
+//
 float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
   return vbfdotq_lane_f32(r, a, b, 0);
 }
 
-// CHECK-LABEL: test_vbfmmlaq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
-// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmmla1.i
+// CHECK-LABEL: @test_vbfmmlaq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
+//
 float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmmlaq_f32(r, a, b);
 }
 
-// CHECK-LABEL: test_vbfmlalbq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
-// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
+// CHECK-LABEL: @test_vbfmlalbq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
+//
 float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlalbq_f32(r, a, b);
 }
 
-// CHECK-LABEL: test_vbfmlaltq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8>
-// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
+// CHECK-LABEL: @test_vbfmlaltq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
+//
 float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlaltq_f32(r, a, b);
 }
 
-// CHECK-LABEL: test_vbfmlalbq_lane_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
+// CHECK-LABEL: @test_vbfmlalbq_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
+//
 float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
   return vbfmlalbq_lane_f32(r, a, b, 0);
 }
 
-// CHECK-LABEL: test_vbfmlalbq_laneq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmlalb1.i
+// CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
+//
 float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlalbq_laneq_f32(r, a, b, 3);
 }
 
-// CHECK-LABEL: test_vbfmlaltq_lane_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
+// CHECK-LABEL: @test_vbfmlaltq_lane_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
+//
 float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
   return vbfmlaltq_lane_f32(r, a, b, 0);
 }
 
-// CHECK-LABEL: test_vbfmlaltq_laneq_f32
-// CHECK-NEXT: entry:
-// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8>
-// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-// CHECK-NEXT ret <4 x float> %vbfmlalt1.i
+// CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
+//
 float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlaltq_laneq_f32(r, a, b, 3);
 }
Index: clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
===================================================================
--- clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
+++ clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -14,7 +14,7 @@
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) [[ATTR3:#.*]]
 // CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
 //
 float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -25,7 +25,7 @@
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) [[ATTR3]]
 // CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
 //
 float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -38,7 +38,7 @@
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer
 // CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) [[ATTR3]]
 // CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
 //
 float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -51,7 +51,7 @@
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) [[ATTR3]]
 // CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
 //
 float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -64,7 +64,7 @@
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3>
 // CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) [[ATTR3]]
 // CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]]
 //
 float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -77,7 +77,7 @@
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer
 // CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) [[ATTR3]]
 // CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]]
 //
 float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -86,10 +86,8 @@
 // CHECK-LABEL: @test_vbfmmlaq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]]
+// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
 //
 float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmmlaq_f32(r, a, b);
@@ -97,10 +95,8 @@
 // CHECK-LABEL: @test_vbfmlalbq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
 //
 float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlalbq_f32(r, a, b);
@@ -108,10 +104,8 @@
 // CHECK-LABEL: @test_vbfmlaltq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
 //
 float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlaltq_f32(r, a, b);
@@ -120,10 +114,8 @@
 // CHECK-LABEL: @test_vbfmlalbq_lane_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
 //
 float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
   return vbfmlalbq_lane_f32(r, a, b, 0);
@@ -132,10 +124,8 @@
 // CHECK-LABEL: @test_vbfmlalbq_laneq_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
 //
 float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlalbq_laneq_f32(r, a, b, 3);
@@ -144,10 +134,8 @@
 // CHECK-LABEL: @test_vbfmlaltq_lane_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
 //
 float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
   return vbfmlaltq_lane_f32(r, a, b, 0);
@@ -156,10 +144,8 @@
 // CHECK-LABEL: @test_vbfmlaltq_laneq_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3
-// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]]
+// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
 //
 float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
   return vbfmlaltq_laneq_f32(r, a, b, 3);
 }
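[Note] The check lines in the two clang tests above are script-generated rather
than hand-written. A sketch of the regeneration step, assuming a typical
monorepo checkout with a build tree under build/ (the script path and the
--llvm-bin flag are the usual convention, not something this patch pins down):

  llvm/utils/update_cc_test_checks.py --llvm-bin build/bin \
      clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c \
      clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c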
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -184,6 +184,10 @@
                 [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
                 [IntrNoMem]>;
+  class AdvSIMD_BF16FML_Intrinsic
+    : Intrinsic<[llvm_v4f32_ty],
+                [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+                [IntrNoMem]>;
 }
 
 // Arithmetic ops
@@ -466,9 +470,12 @@
   def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
   def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
   def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
-  def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic;
-  def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic;
-  def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;
+  def int_aarch64_neon_bfmmla
+      : Intrinsic<[llvm_v4f32_ty],
+                  [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+                  [IntrNoMem]>;
+  def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
+  def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
 
   // v8.6-A Bfloat Intrinsics
Index: llvm/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -791,14 +791,17 @@
   : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
 
 def int_arm_neon_bfdot : Neon_Dot_Intrinsic;
-def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic;
-
-class Neon_FML_Intrinsic
-  : Intrinsic<[llvm_anyvector_ty],
-              [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
-              [IntrNoMem]>;
-def int_arm_neon_bfmlalb : Neon_FML_Intrinsic;
-def int_arm_neon_bfmlalt : Neon_FML_Intrinsic;
+def int_arm_neon_bfmmla
+  : Intrinsic<[llvm_v4f32_ty],
+              [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+              [IntrNoMem]>;
+
+class Neon_BF16FML_Intrinsic
+  : Intrinsic<[llvm_v4f32_ty],
+              [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
+              [IntrNoMem]>;
+def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic;
+def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic;
 
 def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7861,7 +7861,7 @@
                                       (InputType RegType:$Rn),
                                       (InputType (bitconvert (AccumType
                                           (AArch64duplane32 (v4f32 V128:$Rm),
-                                                            VectorIndexH:$idx)))))))]> {
+                                                            VectorIndexS:$idx)))))))]> {
   bits<2> idx;
   let Inst{21} = idx{0};  // L
@@ -7879,8 +7879,8 @@
 class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
   : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
        [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
-                                        (v16i8 V128:$Rn),
-                                        (v16i8 V128:$Rm)))]> {
+                                        (v8bf16 V128:$Rn),
+                                        (v8bf16 V128:$Rm)))]> {
   let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
 }
@@ -7890,10 +7890,10 @@
       "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
           [(set (v4f32 V128:$dst),
                 (v4f32 (OpNode (v4f32 V128:$Rd),
-                               (v16i8 V128:$Rn),
-                               (v16i8 (bitconvert (v8bf16
+                               (v8bf16 V128:$Rn),
+                               (v8bf16
                                   (AArch64duplane16 (v8bf16 V128_lo:$Rm),
-                                                    VectorIndexH:$idx)))))))]>,
+                                                    VectorIndexH:$idx)))))]>,
     Sched<[WriteV]> {
   bits<5> Rd;
   bits<5> Rn;
@@ -7917,8 +7917,8 @@
                                           V128, asm, ".4s",
                                 [(set (v4f32 V128:$dst),
                                       (int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
-                                                               (v16i8 V128:$Rn),
-                                                               (v16i8 V128:$Rm)))]> {
+                                                               (v8bf16 V128:$Rn),
+                                                               (v8bf16 V128:$Rm)))]> {
   let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
                                     ", $Rm", ".8h", "}");
 }
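[Note] With the intrinsics now taking <8 x bfloat> directly, the AArch64
selection patterns above (and their ARM counterparts below) no longer need a
bitconvert between the lane duplicate and the multiply. A sketch of the IR
shape the indexed bfmlalb pattern is meant to catch, with illustrative value
names; the splat shufflevector is what becomes the duplane node during
instruction selection:

  %dup = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> zeroinitializer
  %r1  = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %acc, <8 x bfloat> %a, <8 x bfloat> %dup)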
Index: llvm/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -9091,8 +9091,8 @@
                 (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
                 N3RegFrm, IIC_VDOTPROD, "", "",
                 [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
-                                                             (v16i8 QPR:$Vn),
-                                                             (v16i8 QPR:$Vm)))]> {
+                                                             (v8bf16 QPR:$Vn),
+                                                             (v8bf16 QPR:$Vm)))]> {
   let Constraints = "$dst = $Vd";
   let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
   let DecoderNamespace = "VFPV8";
@@ -9106,8 +9106,8 @@
           NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
           [(set (v4f32 QPR:$dst),
                 (OpNode (v4f32 QPR:$Vd),
-                        (v16i8 QPR:$Vn),
-                        (v16i8 QPR:$Vm)))]> {
+                        (v8bf16 QPR:$Vn),
+                        (v8bf16 QPR:$Vm)))]> {
   let Constraints = "$dst = $Vd";
   let DecoderNamespace = "VFPV8";
 }
@@ -9128,9 +9128,9 @@
   def : Pat<
     (v4f32 (OpNode (v4f32 QPR:$Vd),
-                   (v16i8 QPR:$Vn),
-                   (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
-                                                           VectorIndex16:$lane)))))),
+                   (v8bf16 QPR:$Vn),
+                   (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
+                                        VectorIndex16:$lane)))),
     (!cast<Instruction>(NAME) QPR:$Vd,
                               QPR:$Vn,
                               (EXTRACT_SUBREG QPR:$Vm,
Index: llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
+++ llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll
@@ -87,10 +87,8 @@
 ; CHECK-NEXT:    bfmmla v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmmla1.i
+  %vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmmlaq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -99,10 +97,8 @@
 ; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -111,23 +107,20 @@
 ; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.8h
 ; CHECK-NEXT:    ret
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfmlalbq_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK:    bfmlalb v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -137,23 +130,20 @@
 ; CHECK-NEXT:    ret
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
 ; CHECK-LABEL: test_vbfmlaltq_lane_f32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK:    bfmlalt v0.4s, v1.8h, v2.h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.h[0]
 ; CHECK-NEXT:    ret
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -163,14 +153,12 @@
 ; CHECK-NEXT:    ret
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
-declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
-declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
+declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
+declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
Index: llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
===================================================================
--- llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
+++ llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll
@@ -89,10 +89,8 @@
 ; CHECK-NEXT:    vmmla.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmmla1.i
+  %vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmmlaq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -101,10 +99,8 @@
 ; CHECK-NEXT:    vfmab.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -113,10 +109,8 @@
 ; CHECK-NEXT:    vfmat.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@@ -127,10 +121,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -140,10 +132,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@@ -154,10 +144,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -167,28 +155,12 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
-}
-
-define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
-; CHECK-LABEL: test_vbfmlaltq_laneq_f32_v2:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdup.16 q8, d5[2]
-; CHECK-NEXT:    vfmat.bf16 q0, q1, q8
-; CHECK-NEXT:    bx lr
-entry:
-  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
 declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)
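[Note] For reference, a minimal self-contained .ll sketch of the intrinsics in
their new form (function and value names here are illustrative, not part of the
patch). The intrinsics are now non-overloaded, so the old .v4f32.v16i8 name
suffixes are gone; as far as this diff shows, no AutoUpgrade mapping is added,
so out-of-tree IR using the old names and signatures must be regenerated:

  define <4 x float> @example(<4 x float> %acc, <8 x bfloat> %a, <8 x bfloat> %b) {
  entry:
    ; bf16 matrix multiply-accumulate, then a widening multiply-add of the
    ; even-numbered (bottom) bf16 lanes.
    %mm  = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %acc, <8 x bfloat> %a, <8 x bfloat> %b)
    %res = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %mm, <8 x bfloat> %a, <8 x bfloat> %b)
    ret <4 x float> %res
  }

  declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
  declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)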