Index: clang/include/clang/Basic/arm_sve.td
===================================================================
--- clang/include/clang/Basic/arm_sve.td
+++ clang/include/clang/Basic/arm_sve.td
@@ -563,6 +563,11 @@
 def SVST1W_VNUM_S : MInst<"svst1w_vnum[_{d}]", "vPCld", "l",               [IsStore], MemEltTyInt32,   "aarch64_sve_st1">;
 def SVST1W_VNUM_U : MInst<"svst1w_vnum[_{d}]", "vPGld", "Ul",              [IsStore], MemEltTyInt32,   "aarch64_sve_st1">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVST1_BF      : MInst<"svst1[_{d}]",      "vPpd",  "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">;
+  def SVST1_VNUM_BF : MInst<"svst1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_st1">;
+}
+
 // Store one vector (vector base)
 def SVST1_SCATTER_BASES_U     : MInst<"svst1_scatter[_{2}base_{d}]",  "vPud",  "ilUiUlfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1_scatter_scalar_offset">;
 def SVST1B_SCATTER_BASES_U    : MInst<"svst1b_scatter[_{2}base_{d}]", "vPud",  "ilUiUl",   [IsScatterStore], MemEltTyInt8,    "aarch64_sve_st1_scatter_scalar_offset">;
@@ -654,6 +659,11 @@
 // Store one vector, with no truncation, non-temporal (scalar base, VL displacement)
 def SVSTNT1_VNUM : MInst<"svstnt1_vnum[_{d}]", "vPpld", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVSTNT1_BF      : MInst<"svstnt1[_{d}]",      "vPpd",  "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
+  def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Prefetches
 
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st1-bfloat.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svst1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data)
+{
+  // CHECK-LABEL: test_svst1_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK: ret void
+  // expected-warning@+1 {{implicit declaration of function 'svst1_bf16'}}
+  return SVE_ACLE_FUNC(svst1,_bf16,,)(pg, base, data);
+}
+
+void test_svst1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data)
+{
+  // CHECK-LABEL: test_svst1_vnum_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+  // CHECK: call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK: ret void
+  // expected-warning@+1 {{implicit declaration of function 'svst1_vnum_bf16'}}
+  return SVE_ACLE_FUNC(svst1_vnum,_bf16,,)(pg, base, vnum, data);
+}
Index: clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_stnt1-bfloat.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svstnt1_bf16(svbool_t pg, bfloat16_t *base, svbfloat16_t data)
+{
+  // CHECK-LABEL: test_svstnt1_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret
+  // expected-warning@+1 {{implicit declaration of function 'svstnt1_bf16'}}
+  return SVE_ACLE_FUNC(svstnt1,_bf16,,)(pg, base, data);
+}
+
+void test_svstnt1_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16_t data)
+{
+  // CHECK-LABEL: test_svstnt1_vnum_bf16
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret
+  // expected-warning@+1 {{implicit declaration of function 'svstnt1_vnum_bf16'}}
+  return SVE_ACLE_FUNC(svstnt1_vnum,_bf16,,)(pg, base, vnum, data);
+}
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12037,6 +12037,7 @@
   case MVT::nxv8i8:
   case MVT::nxv8i16:
   case MVT::nxv8f16:
+  case MVT::nxv8bf16:
     return MVT::nxv8i16;
   case MVT::nxv16i8:
     return MVT::nxv16i8;
@@ -12127,6 +12128,12 @@
   EVT HwSrcVt = getSVEContainerType(DataVT);
   SDValue InputVT = DAG.getValueType(DataVT);
 
+  const bool hasBF16 =
+    static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16();
+
+  assert((DataVT != MVT::nxv8bf16 || hasBF16) &&
+         "Unsupported type (BF16)");
+
   if (DataVT.isFloatingPoint())
     InputVT = DAG.getValueType(HwSrcVt);
 
@@ -12153,6 +12160,12 @@
   EVT DataVT = Data.getValueType();
   EVT PtrTy = N->getOperand(4).getValueType();
 
+  const bool hasBF16 =
+    static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16();
+
+  assert((DataVT != MVT::nxv8bf16 || hasBF16) &&
+         "Unsupported type (BF16)");
+
   if (DataVT.isFloatingPoint())
     Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
 
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1416,6 +1416,7 @@
     def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
     def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
     def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+    def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
     def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
     def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
 
@@ -1563,9 +1564,13 @@
   defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store,  ST1W,   ST1W_IMM,   am_sve_regreg_lsl2>;
 
   // 8-element contiguous stores
-  defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
-  defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H,   ST1H_IMM,   am_sve_regreg_lsl1>;
-  defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H,   ST1H_IMM,   am_sve_regreg_lsl1>;
+  defm : pred_store<nxv8i16,  nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
+  defm : pred_store<nxv8i16,  nxv8i1, nontrunc_masked_store, ST1H,   ST1H_IMM,   am_sve_regreg_lsl1>;
+  defm : pred_store<nxv8f16,  nxv8i1, nontrunc_masked_store, ST1H,   ST1H_IMM,   am_sve_regreg_lsl1>;
+
+  let Predicates = [HasBF16] in {
+    defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H,   ST1H_IMM,   am_sve_regreg_lsl1>;
+  }
 
   // 16-element contiguous stores
   defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -126,6 +126,17 @@
   ret void
 }
 
+define void @st1h_bf16_inbound(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: st1h_bf16_inbound:
+; CHECK: st1h { z0.h }, p0, [x0, #-5, mul vl]
+; CHECK-NEXT: ret
+  %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
+  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 -5
+  %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
+  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pg, bfloat* %base_scalar)
+  ret void
+}
+
 define void @st1h_s_inbound(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, i16* %a) {
 ; CHECK-LABEL: st1h_s_inbound:
 ; CHECK: st1h { z0.s }, p0, [x0, #2, mul vl]
@@ -219,6 +230,7 @@
 declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*)
 declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
 declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 
 declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*)
 declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-st1-addressing-mode-reg-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -82,6 +82,17 @@
   ret void
 }
 
+define void @st1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %a, i64 %index) {
+; CHECK-LABEL: st1h_bf16:
+; CHECK: st1h { z0.h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %base = getelementptr bfloat, bfloat* %a, i64 %index
+  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                           <vscale x 8 x i1> %pred,
+                                           bfloat* %base)
+  ret void
+}
+
 define void @st1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: st1h_s:
 ; CHECK: st1h { z0.s }, p0, [x0]
@@ -174,6 +185,7 @@
 declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*)
 declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
 declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 
 declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*)
 declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-st1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
-; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
+; RUN: llc -O0 -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -75,6 +75,16 @@
   ret void
 }
 
+define void @st1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st1h_bf16:
+; CHECK: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                           <vscale x 8 x i1> %pred,
+                                           bfloat* %addr)
+  ret void
+}
+
 define void @st1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: st1h_s:
 ; CHECK: st1h { z0.s }, p0, [x0]
@@ -161,6 +171,7 @@
 declare void @llvm.aarch64.sve.st1.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i1>, i8*)
 declare void @llvm.aarch64.sve.st1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
 declare void @llvm.aarch64.sve.st1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 
 declare void @llvm.aarch64.sve.st1.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i1>, i8*)
 declare void @llvm.aarch64.sve.st1.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i16*)
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
@@ -377,6 +377,16 @@
   ret void
 }
 
+define void @stnt1h_bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: stnt1h_bf16:
+; CHECK: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                             <vscale x 8 x i1> %pred,
+                                             bfloat* %addr)
+  ret void
+}
+
 ;
 ; STNT1W
 ;
@@ -458,5 +468,6 @@
 declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
 declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
 declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)
 declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)
Index: llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
@@ -179,6 +179,14 @@
   ret void
 }
 
+define void @masked_store_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x bfloat> %val, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_store_nxv8bf16:
+; CHECK-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %val, <vscale x 8 x bfloat> *%a, i32 2, <vscale x 8 x i1> %mask)
+  ret void
+}
+
 declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
 declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
 declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
@@ -203,3 +211,4 @@
 declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
 declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)
Index: llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
+++ llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve,+bf16 --asm-verbose=false < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -513,6 +513,24 @@
   ret void
 }
 
+define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8bf16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
+  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_load,
+                                                                i32 1,
+                                                                <vscale x 8 x i1> %mask,
+                                                                <vscale x 8 x bfloat> undef)
+  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
+  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                        <vscale x 8 x bfloat>* %base_store,
+                                        i32 1,
+                                        <vscale x 8 x i1> %mask)
+  ret void
+}
+
 ; 8-lane zero/sign extended contiguous loads.
 
 define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(<vscale x 8 x i8>* %base, <vscale x 8 x i1> %mask) nounwind {
@@ -596,6 +614,7 @@
 declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
 declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
 declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
 
 ; 16-element contiguous loads.
 declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
@@ -620,6 +639,7 @@
 declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
 declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
 declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)
 
 ; 16-element contiguous stores.
 declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
Index: llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll
+++ llvm/test/CodeGen/AArch64/sve-pred-contiguous-ldst-addressing-mode-reg-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve,+bf16 --asm-verbose=false < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -498,6 +498,24 @@
   ret void
 }
 
+define void @test_masked_ldst_sv8bf16(bfloat * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8bf16:
+; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %base_f16 = getelementptr bfloat, bfloat* %base, i64 %offset
+  %base_addr = bitcast bfloat* %base_f16 to <vscale x 8 x bfloat>*
+  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_addr,
+                                                               i32 1,
+                                                               <vscale x 8 x i1> %mask,
+                                                               <vscale x 8 x bfloat> undef)
+  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                        <vscale x 8 x bfloat>* %base_addr,
+                                        i32 1,
+                                        <vscale x 8 x i1> %mask)
+  ret void
+}
+
 ; 8-lane zero/sign extended contiguous loads.
 
 define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
@@ -584,6 +602,7 @@
 declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
 declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
 declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
 
 ; 16-element contiguous loads.
 declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
@@ -608,6 +627,7 @@
 declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
 declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
 declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
+declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)
 
 ; 16-element contiguous stores.
 declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)
Index: llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
+++ llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve,+bf16 --asm-verbose=false < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -139,6 +139,23 @@
   ret void
 }
 
+define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8bf16:
+; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
+; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
+; CHECK-NEXT: ret
+  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
+  %base_load_bc = bitcast <vscale x 8 x bfloat>* %base_load to bfloat*
+  %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask,
+                                                                      bfloat* %base_load_bc)
+  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
+  %base_store_bc = bitcast <vscale x 8 x bfloat>* %base_store to bfloat*
+  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                             <vscale x 8 x i1> %mask,
+                                             bfloat* %base_store_bc)
+  ret void
+}
+
 ; 16-lane non-temporal load/stores.
 
 define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
@@ -169,6 +186,7 @@
 ; 8-element non-temporal loads.
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 ; 16-element non-temporal loads.
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
@@ -176,14 +194,15 @@
 ; 2-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
 declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)
-                                                                      
-; 4-element non-temporal stores.                                        
+
+; 4-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
 declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)
-                                                                      
-; 8-element non-temporal stores.                                        
+
+; 8-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
 declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 
 ; 16-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*)
Index: llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
+++ llvm/test/CodeGen/AArch64/sve-pred-non-temporal-ldst-addressing-mode-reg-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve,+bf16 --asm-verbose=false < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -94,6 +94,20 @@
   ret void
 }
 
+define void @test_masked_ldst_sv8bf16(bfloat* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
+; CHECK-LABEL: test_masked_ldst_sv8bf16:
+; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %gep = getelementptr bfloat, bfloat* %base, i64 %offset
+  %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask,
+                                                                      bfloat* %gep)
+  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
+                                             <vscale x 8 x i1> %mask,
+                                             bfloat* %gep)
+  ret void
+}
+
 ; 16-lane non-temporal load/stores.
 
 define void @test_masked_ldst_sv16i8(i8* %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
@@ -121,6 +135,7 @@
 ; 8-element non-temporal loads.
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 ; 16-element non-temporal loads.
 declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
@@ -128,14 +143,15 @@
 ; 2-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
 declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)
-                                                                      
-; 4-element non-temporal stores.                                        
+
+; 4-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
 declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)
-                                                                      
-; 8-element non-temporal stores.                                        
+
+; 8-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
 declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 
 ; 16-element non-temporal stores.
 declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*)