diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6241,28 +6241,10 @@ case NEON::BI__builtin_neon_vbfdot_v: case NEON::BI__builtin_neon_vbfdotq_v: { llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16); llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot"); } - case NEON::BI__builtin_neon_vbfmmlaq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmmla"); - } - case NEON::BI__builtin_neon_vbfmlalbq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalb"); - } - case NEON::BI__builtin_neon_vbfmlaltq_v: { - llvm::Type *InputTy = - llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); - llvm::Type *Tys[2] = { Ty, InputTy }; - return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt"); - } case NEON::BI__builtin_neon___a32_vcvt_bf16_v: { llvm::Type *Tys[1] = { Ty }; Function *F = CGM.getIntrinsic(Int, Tys); diff --git a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c --- a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c @@ -1,146 +1,138 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \ // RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg -instcombine | FileCheck %s #include <arm_neon.h> -// CHECK-LABEL: test_vbfdot_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %1 = bitcast <4 x bfloat> %b to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); } -// CHECK-LABEL: test_vbfdotq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ return vbfdotq_f32(r, a, b); } -// CHECK-LABEL: test_vbfdot_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float> -// CHECK-NEXT %lane = 
shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer -// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ return vbfdot_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfdotq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float> -// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfdot_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %b to <4 x float> -// CHECK-NEXT %lane = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> -// CHECK-NEXT %1 = bitcast <4 x bfloat> %a to <8 x i8> -// CHECK-NEXT %2 = bitcast <2 x float> %lane to <8 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) -// CHECK-NEXT ret <2 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdot_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] +// float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfdotq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <4 x bfloat> %b to <2 x float> -// CHECK-NEXT %lane = shufflevector <2 x float> %0, <2 x float> 
undef, <4 x i32> zeroinitializer -// CHECK-NEXT %1 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %2 = bitcast <4 x float> %lane to <16 x i8> -// CHECK-NEXT %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) -// CHECK-NEXT ret <4 x float> %vbfdot1.i +// CHECK-LABEL: @test_vbfdotq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] +// float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmmlaq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmmla1.i +// CHECK-LABEL: @test_vbfmmlaq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]] +// float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlalbq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlaltq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %b to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); } -// CHECK-LABEL: test_vbfmlalbq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT 
%vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmlalbq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalb1.i +// CHECK-LABEL: @test_vbfmlalbq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] +// float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); } -// CHECK-LABEL: test_vbfmlaltq_lane_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_lane_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); } -// CHECK-LABEL: test_vbfmlaltq_laneq_f32 -// CHECK-NEXT: entry: -// CHECK-NEXT %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT %0 = bitcast <8 x bfloat> %a to <16 x i8> -// CHECK-NEXT %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> -// CHECK-NEXT %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) -// CHECK-NEXT ret <4 x float> %vbfmlalt1.i +// CHECK-LABEL: @test_vbfmlaltq_laneq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] +// float32x4_t 
test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); } diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c --- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c @@ -12,10 +12,8 @@ // CHECK-LABEL: @test_vbfdot_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) [[ATTR3:#.*]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { return vbfdot_f32(r, a, b); @@ -23,10 +21,8 @@ // CHECK-LABEL: @test_vbfdotq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ return vbfdotq_f32(r, a, b); @@ -36,10 +32,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <2 x i32> zeroinitializer -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ return vbfdot_lane_f32(r, a, b, 0); @@ -49,10 +44,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> 
@llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfdotq_laneq_f32(r, a, b, 3); @@ -62,10 +56,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> undef, <2 x i32> <i32 3, i32 3> -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> [[R:%.*]], <8 x i8> [[TMP0]], <8 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <2 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { return vbfdot_laneq_f32(r, a, b, 3); @@ -75,10 +68,9 @@ // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> undef, <4 x i32> zeroinitializer -// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[DOTCAST1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFDOT1_I]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfdotq_lane_f32(r, a, b, 0); @@ -86,10 +78,8 @@ // CHECK-LABEL: @test_vbfmmlaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMMLA1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMMLA1_I]] +// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); @@ -97,10 +87,8 @@ // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x 
float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); @@ -108,10 +96,8 @@ // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); @@ -120,10 +106,8 @@ // CHECK-LABEL: @test_vbfmlalbq_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); @@ -132,10 +116,8 @@ // CHECK-LABEL: @test_vbfmlalbq_laneq_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALB1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALB1_I]] +// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); @@ -144,10 +126,8 @@ // CHECK-LABEL: @test_vbfmlaltq_lane_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> undef, <8 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x 
float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); @@ -156,10 +136,8 @@ // CHECK-LABEL: @test_vbfmlaltq_laneq_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALT1_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> [[R:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #3 -// CHECK-NEXT: ret <4 x float> [[VBFMLALT1_I]] +// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) [[ATTR3]] +// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]] // float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -184,6 +184,10 @@ [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + class AdvSIMD_BF16FML_Intrinsic + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; } // Arithmetic ops @@ -466,9 +470,12 @@ def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic; - def int_aarch64_neon_bfmmla : AdvSIMD_MatMul_Intrinsic; - def int_aarch64_neon_bfmlalb : AdvSIMD_FML_Intrinsic; - def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic; + def int_aarch64_neon_bfmmla + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; + def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic; + def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic; // v8.6-A Bfloat Intrinsics diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -791,14 +791,17 @@ : Intrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; def int_arm_neon_bfdot : Neon_Dot_Intrinsic; -def int_arm_neon_bfmmla : Neon_MatMul_Intrinsic; - -class Neon_FML_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; -def int_arm_neon_bfmlalb : Neon_FML_Intrinsic; -def int_arm_neon_bfmlalt : Neon_FML_Intrinsic; +def int_arm_neon_bfmmla + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; + +class Neon_BF16FML_Intrinsic + : Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty], + [IntrNoMem]>; +def int_arm_neon_bfmlalb : Neon_BF16FML_Intrinsic; +def int_arm_neon_bfmlalt : Neon_BF16FML_Intrinsic; def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -632,6 +632,63 @@ return true; } } + + // Changed in 12.0: bfdot accepts v4bf16 and v8bf16 instead of v8i8 and v16i8 + // respectively + if ((Name.startswith("arm.neon.bfdot.") || 
Name.startswith("aarch64.neon.bfdot.")) && + Name.endswith("i8")) { + Intrinsic::ID IID = + StringSwitch(Name) + .Cases("arm.neon.bfdot.v2f32.v8i8", + "arm.neon.bfdot.v4f32.v16i8", + Intrinsic::arm_neon_bfdot) + .Cases("aarch64.neon.bfdot.v2f32.v8i8", + "aarch64.neon.bfdot.v4f32.v16i8", + Intrinsic::aarch64_neon_bfdot) + .Default(Intrinsic::not_intrinsic); + if (IID == Intrinsic::not_intrinsic) + break; + + size_t OperandWidth = F->getReturnType()->getPrimitiveSizeInBits(); + assert((OperandWidth == 64 || OperandWidth == 128) && + "Unexpected operand width"); + LLVMContext &Ctx = F->getParent()->getContext(); + std::array Tys {{ + F->getReturnType(), + FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16) + }}; + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); + return true; + } + + // Changed in 12.0: bfmmla, bfmlalb and bfmlalt are not polymorphic anymore + // and accept v8bf16 instead of v16i8 + if ((Name.startswith("arm.neon.bfm") || + Name.startswith("aarch64.neon.bfm")) && + Name.endswith(".v4f32.v16i8")) { + Intrinsic::ID IID = + StringSwitch(Name) + .Case("arm.neon.bfmmla.v4f32.v16i8", + Intrinsic::arm_neon_bfmmla) + .Case("arm.neon.bfmlalb.v4f32.v16i8", + Intrinsic::arm_neon_bfmlalb) + .Case("arm.neon.bfmlalt.v4f32.v16i8", + Intrinsic::arm_neon_bfmlalt) + .Case("aarch64.neon.bfmmla.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmmla) + .Case("aarch64.neon.bfmlalb.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmlalb) + .Case("aarch64.neon.bfmlalt.v4f32.v16i8", + Intrinsic::aarch64_neon_bfmlalt) + .Default(Intrinsic::not_intrinsic); + if (IID == Intrinsic::not_intrinsic) + break; + + std::array Tys; + NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); + return true; + } break; } @@ -3618,6 +3675,30 @@ break; } + case Intrinsic::arm_neon_bfdot: + case Intrinsic::arm_neon_bfmmla: + case Intrinsic::arm_neon_bfmlalb: + case Intrinsic::arm_neon_bfmlalt: + case Intrinsic::aarch64_neon_bfdot: + case Intrinsic::aarch64_neon_bfmmla: + case Intrinsic::aarch64_neon_bfmlalb: + case Intrinsic::aarch64_neon_bfmlalt: { + SmallVector Args; + assert(CI->getNumArgOperands() == 3 && + "Mismatch between function args and call args"); + size_t OperandWidth = + CI->getArgOperand(1)->getType()->getPrimitiveSizeInBits(); + assert((OperandWidth == 64 || OperandWidth == 128) && + "Unexpected operand width"); + Type *NewTy = FixedVectorType::get(Type::getBFloatTy(C), OperandWidth / 16); + auto Iter = CI->arg_operands().begin(); + Args.push_back(*Iter++); + Args.push_back(Builder.CreateBitCast(*Iter++, NewTy)); + Args.push_back(Builder.CreateBitCast(*Iter++, NewTy)); + NewCall = Builder.CreateCall(NewFn, Args); + break; + } + case Intrinsic::bitreverse: NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)}); break; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7841,9 +7841,9 @@ multiclass SIMDThreeSameVectorBFDot { def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64, - v2f32, v8i8>; + v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128, - v4f32, v16i8>; + v4f32, v8bf16>; } class BaseSIMDThreeSameVectorBF16DotI { + VectorIndexS:$idx)))))))]> { bits<2> idx; let Inst{21} = idx{0}; // L @@ -7871,16 +7871,16 @@ multiclass SIMDThreeSameVectorBF16DotI { def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h", - ".2h", V64, v2f32, v8i8>; + ".2h", V64, 
v2f32, v4bf16>; def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h", - ".2h", V128, v4f32, v16i8>; + ".2h", V128, v4f32, v8bf16>; } class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode> : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}"); } @@ -7890,10 +7890,10 @@ "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst", [(set (v4f32 V128:$dst), (v4f32 (OpNode (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 (bitconvert (v8bf16 + (v8bf16 V128:$Rn), + (v8bf16 (AArch64duplane16 (v8bf16 V128_lo:$Rm), - VectorIndexH:$idx)))))))]>, + VectorIndexH:$idx)))))]>, Sched<[WriteV]> { bits<5> Rd; bits<5> Rn; @@ -7917,8 +7917,8 @@ V128, asm, ".4s", [(set (v4f32 V128:$dst), (int_aarch64_neon_bfmmla (v4f32 V128:$Rd), - (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]> { + (v8bf16 V128:$Rn), + (v8bf16 V128:$Rm)))]> { let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h", ", $Rm", ".8h", "}"); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -798,6 +798,23 @@ def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; def BFCVT : BF16ToSinglePrecision<"bfcvt">; + +// Vector-scalar BFDOT: +// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit +// register (the instruction uses a single 32-bit lane from it), so the pattern +// is a bit tricky. +def : Pat<(v2f32 (int_aarch64_neon_bfdot + (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (v4bf16 (bitconvert + (v2i32 (AArch64duplane32 + (v4i32 (bitconvert + (v8bf16 (insert_subvector undef, + (v4bf16 V64:$Rm), + (i64 0))))), + VectorIndexS:$idx)))))), + (BF16DOTlanev4bf16 (v2f32 V64:$Rd), (v4bf16 V64:$Rn), + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; } // ARMv8.6A AArch64 matrix multiplication diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -9079,11 +9079,11 @@ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; } -def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>; -def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>; +def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v4bf16>; +def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v8bf16>; -defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>; -defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v4bf16, (v2f32 DPR_VFP2:$Vm)>; +defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v8bf16, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; class BF16MM<bit Q, RegisterClass RegTy, string opc> @@ -9091,8 +9091,8 @@ (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD, "", "", [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - (v16i8 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm"); let DecoderNamespace = "VFPV8"; } @@ -9106,8 +9106,8 @@ NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "", [(set (v4f32 QPR:$dst), (OpNode (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - (v16i8 QPR:$Vm)))]> { + (v8bf16 QPR:$Vn), + (v8bf16 QPR:$Vm)))]> { let Constraints = "$dst = $Vd"; let DecoderNamespace = "VFPV8"; } @@ -9128,9 +9128,9 @@ def : Pat< (v4f32 (OpNode (v4f32 QPR:$Vd), - (v16i8 QPR:$Vn), - 
(v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), - VectorIndex16:$lane)))))), + (v8bf16 QPR:$Vn), + (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm), + VectorIndex16:$lane)))), (!cast<Instruction>(NAME) QPR:$Vd, QPR:$Vn, (EXTRACT_SUBREG QPR:$Vm, diff --git a/llvm/test/Bitcode/aarch64-bf16-upgrade.ll b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll @@ -0,0 +1,76 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; Bitcode was generated from file below + +define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdot_f32 +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat> + ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3) + %vbfdot1.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ret <2 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdotq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + %vbfdot1.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ret <4 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmmlaq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmmla1.i +} + +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlalbq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalb1.i +} + +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlaltq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 
= bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) +; CHECK: declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) \ No newline at end of file diff --git a/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc b/llvm/test/Bitcode/aarch64-bf16-upgrade.ll.bc new file mode 100644 diff --git a/llvm/test/Bitcode/arm-bf16-upgrade.ll b/llvm/test/Bitcode/arm-bf16-upgrade.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Bitcode/arm-bf16-upgrade.ll @@ -0,0 +1,76 @@ +; RUN: llvm-dis < %s.bc | FileCheck %s + +; Bitcode was generated from file below + +define arm_aapcs_vfpcc <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdot_f32 +entry: + %0 = bitcast <4 x bfloat> %a to <8 x i8> + %1 = bitcast <4 x bfloat> %b to <8 x i8> + %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) + ; CHECK: %2 = bitcast <8 x i8> %0 to <4 x bfloat> + ; CHECK-NEXT: %3 = bitcast <8 x i8> %1 to <4 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %2, <4 x bfloat> %3) + ret <2 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfdotq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfdot1.i +} + +define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmmlaq_f32 +entry: + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %b to <16 x i8> + %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmmla1.i +} + +define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: 
@test_vbfmlalbq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalb1.i +} + +define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { +; CHECK-LABEL: @test_vbfmlaltq_laneq_f32 +entry: + %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %0 = bitcast <8 x bfloat> %a to <16 x i8> + %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> + %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) + ; CHECK: %2 = bitcast <16 x i8> %0 to <8 x bfloat> + ; CHECK-NEXT: %3 = bitcast <16 x i8> %1 to <8 x bfloat> + ; CHECK-NEXT: %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %2, <8 x bfloat> %3) + ret <4 x float> %vbfmlalt1.i +} + +declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) +; CHECK: declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) +; CHECK: declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) \ No newline at end of file diff --git a/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc b/llvm/test/Bitcode/arm-bf16-upgrade.ll.bc new file mode 100644 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll --- a/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bf16-dotprod-intrinsics.ll @@ -7,10 +7,8 @@ ; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.4h ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %a to <8 x i8> - %1 = bitcast <4 x bfloat> %b to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) - ret <2 x float> %vbfdot1.i + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -19,24 +17,22 @@ ; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfdot1.i + %vbfdot3.i = call <4 x float> 
@llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfdot_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfdot v0.2s, v1.4h, v2.2h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[0] ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -45,12 +41,11 @@ ; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[3] ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { @@ -59,26 +54,25 @@ ; CHECK-NEXT: bfdot v0.2s, v1.4h, v2.2h[3] ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfdotq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfdot v0.4s, v1.8h, v2.2h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfdot v0.4s, v1.8h, v2.2h[0] ; CHECK-NEXT: ret entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x 
i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) + ret <4 x float> %vbfdot3.i } define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -87,10 +81,8 @@ ; CHECK-NEXT: bfmmla v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmmla1.i + %vbfmmlaq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmmlaq_v3.i } define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -99,10 +91,8 @@ ; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -111,23 +101,20 @@ ; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.8h ; CHECK-NEXT: ret entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfmlalbq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfmlalb v0.4s, v1.8h, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfmlalb v0.4s, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -137,23 +124,20 @@ ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalb1.i + %vbfmlalbq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlalbq_v3.i } define <4 x float> 
@test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-LABEL: test_vbfmlaltq_lane_f32: ; CHECK: // %bb.0: // %entry -; CHECK: bfmlalt v0.4s, v1.8h, v2.h[0] +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: bfmlalt v0.4s, v1.8h, v2.h[0] ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -163,14 +147,12 @@ ; CHECK-NEXT: ret entry: %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8> - %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfmlalt1.i + %vbfmlaltq_v3.i = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35) + ret <4 x float> %vbfmlaltq_v3.i } -declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 -declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2 +declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>) +declare <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>) diff --git a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll --- a/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll +++ b/llvm/test/CodeGen/ARM/arm-bf16-dotprod-intrinsics.ll @@ -7,10 +7,8 @@ ; CHECK-NEXT: vdot.bf16 d0, d1, d2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %a to <8 x i8> - %1 = bitcast <4 x bfloat> %b to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1) - ret <2 x float> %vbfdot1.i + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -19,10 +17,8 @@ ; CHECK-NEXT: vdot.bf16 q0, q1, q2 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %a to <16 x i8> - %1 = bitcast <8 x bfloat> %b to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1) - ret <4 x float> %vbfdot1.i + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x 
bfloat> %a, <8 x bfloat> %b) #3 + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) { @@ -31,12 +27,11 @@ ; CHECK-NEXT: vdot.bf16 d0, d1, d2[0] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <2 x i32> zeroinitializer + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) { @@ -46,12 +41,11 @@ ; CHECK-NEXT: vdot.bf16 q0, q1, q8 ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3 + ret <4 x float> %vbfdot3.i } define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) { @@ -60,12 +54,11 @@ ; CHECK-NEXT: vdot.bf16 d0, d1, d3[1] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <8 x bfloat> %b to <4 x float> - %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3> - %1 = bitcast <4 x bfloat> %a to <8 x i8> - %2 = bitcast <2 x float> %shuffle to <8 x i8> - %vbfdot1.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2) - ret <2 x float> %vbfdot1.i + %.cast = bitcast <8 x bfloat> %b to <4 x float> + %lane = shufflevector <4 x float> %.cast, <4 x float> undef, <2 x i32> <i32 3, i32 3> + %.cast1 = bitcast <2 x float> %lane to <4 x bfloat> + %vbfdot3.i = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %.cast1) #3 + ret <2 x float> %vbfdot3.i } define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) { @@ -75,12 +68,11 @@ ; CHECK-NEXT: vdot.bf16 q0, q1, d4[0] ; CHECK-NEXT: bx lr entry: - %0 = bitcast <4 x bfloat> %b to <2 x float> - %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer - %1 = bitcast <8 x bfloat> %a to <16 x i8> - %2 = bitcast <4 x float> %shuffle to <16 x i8> - %vbfdot1.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2) - ret <4 x float> %vbfdot1.i + %.cast = bitcast <4 x bfloat> %b to <2 x float> + %lane = shufflevector <2 x float> %.cast, <2 x float> undef, <4 x i32> zeroinitializer + %.cast1 = bitcast <4 x float> %lane to <8 x bfloat> + %vbfdot3.i = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %.cast1) #3 + ret <4 x float> %vbfdot3.i } 
 define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -89,10 +81,8 @@
 ; CHECK-NEXT:    vmmla.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmmla1.i = call <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmmla1.i
+  %vbfmmlaq_v3.i = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmmlaq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -101,10 +91,8 @@
 ; CHECK-NEXT:    vfmab.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -113,10 +101,8 @@
 ; CHECK-NEXT:    vfmat.bf16 q0, q1, q2
 ; CHECK-NEXT:    bx lr
 entry:
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %b to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@@ -127,10 +113,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -140,10 +124,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalb1.i = call <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalb1.i
+  %vbfmlalbq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlalbq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
@@ -154,10 +136,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -167,10 +147,8 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
-  ret <4 x float> %vbfmlalt1.i
+  %vbfmlaltq_v3.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
+  ret <4 x float> %vbfmlaltq_v3.i
 }
 
 define <4 x float> @test_vbfmlaltq_laneq_f32_v2(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
@@ -181,14 +159,12 @@
 ; CHECK-NEXT:    bx lr
 entry:
   %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %0 = bitcast <8 x bfloat> %a to <16 x i8>
-  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
-  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
+  %vbfmlalt1.i = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %vecinit35)
   ret <4 x float> %vbfmlalt1.i
 }
 
-declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>)
-declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
-declare <4 x float> @llvm.arm.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>)
+declare <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float>, <4 x bfloat>, <4 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmmla(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalb(<4 x float>, <8 x bfloat>, <8 x bfloat>)
+declare <4 x float> @llvm.arm.neon.bfmlalt(<4 x float>, <8 x bfloat>, <8 x bfloat>)