Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -221,6 +221,21 @@
 def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1,
                          (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_USDOT_LN
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>;
+def OP_USDOT_LNQ
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>;
+
+// sudot splats the second vector and then calls vusdot
+def OP_SUDOT_LN
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;
+def OP_SUDOT_LNQ
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;
+
 //===----------------------------------------------------------------------===//
 // Auxiliary Instructions
 //===----------------------------------------------------------------------===//
@@ -1792,6 +1807,23 @@
   }
 }
 
+let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in {
+  def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
+  def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
+
+  def VUSDOT  : SInst<"vusdot", "..(<<U)(<<)", "iQi">;
+
+  def VUSDOT_LANE : SOpInst<"vusdot_lane", "..(<<U)(<<q).I", "iQi", OP_USDOT_LN>;
+  def VSUDOT_LANE : SOpInst<"vsudot_lane", "..(<<)(<<qU).I", "iQi", OP_SUDOT_LN>;
+
+  let ArchGuard = "defined(__aarch64__)" in {
+    let isLaneQ = 1 in {
+      def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<<U)(<<Q).I", "iQi", OP_USDOT_LNQ>;
+      def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<<QU).I", "iQi", OP_SUDOT_LNQ>;
+    }
+  }
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
@@ -1808,4 +1840,4 @@
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
-}
\ No newline at end of file
+}
Index: clang/lib/Basic/Targets/AArch64.h
===================================================================
--- clang/lib/Basic/Targets/AArch64.h
+++ clang/lib/Basic/Targets/AArch64.h
@@ -36,6 +36,7 @@
   bool HasFP16FML;
   bool HasMTE;
   bool HasTME;
+  bool HasMatMul;
 
   llvm::AArch64::ArchKind ArchKind;
Index: clang/lib/Basic/Targets/AArch64.cpp
===================================================================
--- clang/lib/Basic/Targets/AArch64.cpp
+++ clang/lib/Basic/Targets/AArch64.cpp
@@ -280,6 +280,9 @@
   if (HasTME)
     Builder.defineMacro("__ARM_FEATURE_TME", "1");
 
+  if (HasMatMul)
+    Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1");
+
   if ((FPU & NeonMode) && HasFP16FML)
     Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
 
@@ -356,6 +359,7 @@
   HasFP16FML = false;
   HasMTE = false;
   HasTME = false;
+  HasMatMul = false;
   ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   for (const auto &Feature : Features) {
@@ -391,6 +395,8 @@
       HasMTE = true;
     if (Feature == "+tme")
       HasTME = true;
+    if (Feature == "+i8mm")
+      HasMatMul = true;
   }
 
   setDataLayout();
Index: clang/lib/Basic/Targets/ARM.h
===================================================================
--- clang/lib/Basic/Targets/ARM.h
+++ clang/lib/Basic/Targets/ARM.h
@@ -75,6 +75,7 @@
   unsigned DSP : 1;
   unsigned Unaligned : 1;
   unsigned DotProd : 1;
+  unsigned HasMatMul : 1;
 
   enum {
     LDREX_B = (1 << 0), /// byte (8-bit)
Index: clang/lib/Basic/Targets/ARM.cpp
=================================================================== --- clang/lib/Basic/Targets/ARM.cpp +++ clang/lib/Basic/Targets/ARM.cpp @@ -425,6 +425,7 @@ // Note that SoftFloatABI is initialized in our constructor. HWDiv = 0; DotProd = 0; + HasMatMul = 0; HasFloat16 = true; ARMCDECoprocMask = 0; @@ -491,6 +492,8 @@ FPU |= FPARMV8; MVE |= MVE_INT | MVE_FP; HW_FP |= HW_FP_SP | HW_FP_HP; + } else if (Feature == "+i8mm") { + HasMatMul = 1; } else if (Feature.size() == strlen("+cdecp0") && Feature >= "+cdecp0" && Feature <= "+cdecp7") { unsigned Coproc = Feature.back() - '0'; @@ -820,6 +823,9 @@ if (DotProd) Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1"); + if (HasMatMul) + Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1"); + switch (ArchKind) { default: break; Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -4781,6 +4781,7 @@ NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType), NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType), NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts), + NEONMAP2(vmmlaq_v, arm_neon_ummla, arm_neon_smmla, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType), @@ -4888,6 +4889,9 @@ NEONMAP0(vtrnq_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), + NEONMAP1(vusdot_v, arm_neon_usdot, 0), + NEONMAP1(vusdotq_v, arm_neon_usdot, 0), + NEONMAP1(vusmmlaq_v, arm_neon_usmmla, 0), NEONMAP0(vuzp_v), NEONMAP0(vuzpq_v), NEONMAP0(vzip_v), @@ -4983,6 +4987,7 @@ NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0), NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0), NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0), + NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType), @@ -5065,6 +5070,9 @@ NEONMAP0(vsubhn_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), + NEONMAP1(vusdot_v, aarch64_neon_usdot, 0), + NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0), + NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0), }; static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { @@ -6047,6 +6055,26 @@ llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high"); } + case NEON::BI__builtin_neon_vmmlaq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla"); + } + case NEON::BI__builtin_neon_vusmmlaq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla"); + } + case NEON::BI__builtin_neon_vusdot_v: + case NEON::BI__builtin_neon_vusdotq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot"); + } } assert(Int && "Expected valid intrinsic number"); Index: clang/lib/Driver/ToolChains/Arch/AArch64.cpp =================================================================== --- clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -54,7 +54,8 @@ // Decode AArch64 features from string like +[no]featureA+[no]featureB+... 
 static bool DecodeAArch64Features(const Driver &D, StringRef text,
-                                  std::vector<StringRef> &Features) {
+                                  std::vector<StringRef> &Features,
+                                  llvm::AArch64::ArchKind ArchKind) {
   SmallVector<StringRef, 8> Split;
   text.split(Split, StringRef("+"), -1, false);
 
@@ -66,6 +67,10 @@
       D.Diag(clang::diag::err_drv_no_neon_modifier);
     else
       return false;
+
+    // +sve implies +f32mm if the base architecture is v8.6A
+    if ((ArchKind == llvm::AArch64::ArchKind::ARMV8_6A) && Feature == "sve")
+      Features.push_back("+f32mm");
   }
 
   return true;
 }
@@ -76,6 +81,7 @@
                                  std::vector<StringRef> &Features) {
   std::pair<StringRef, StringRef> Split = Mcpu.split("+");
   CPU = Split.first;
+  llvm::AArch64::ArchKind ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   if (CPU == "native")
     CPU = llvm::sys::getHostCPUName();
@@ -83,7 +89,7 @@
   if (CPU == "generic") {
     Features.push_back("+neon");
   } else {
-    llvm::AArch64::ArchKind ArchKind = llvm::AArch64::parseCPUArch(CPU);
+    ArchKind = llvm::AArch64::parseCPUArch(CPU);
     if (!llvm::AArch64::getArchFeatures(ArchKind, Features))
       return false;
 
@@ -92,10 +98,11 @@
       return false;
   }
 
-  if (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features))
-    return false;
+  if (Split.second.size() &&
+      !DecodeAArch64Features(D, Split.second, Features, ArchKind))
+    return false;
 
-  return true;
+  return true;
 }
 
 static bool
@@ -108,7 +115,8 @@
   llvm::AArch64::ArchKind ArchKind = llvm::AArch64::parseArch(Split.first);
   if (ArchKind == llvm::AArch64::ArchKind::INVALID ||
       !llvm::AArch64::getArchFeatures(ArchKind, Features) ||
-      (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features)))
+      (Split.second.size() &&
+       !DecodeAArch64Features(D, Split.second, Features, ArchKind)))
     return false;
 
   return true;
Index: clang/test/CodeGen/aarch64-matmul.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-matmul.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+extern "C" void arm_feature_matmulint8_defined() {}
+#endif
+// CHECK: define void @arm_feature_matmulint8_defined()
+
+
Index: clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
@@ -0,0 +1,171 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
+  return vmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vmmlaq_u32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
+  return vmmlaq_u32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusmmlaq_s32(r, a, b);
+} + +// CHECK-LABEL: test_vusdot_s32 +// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) +// CHECK: ret <2 x i32> [[VAL]] +int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %1 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> %2 to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> %3, <2 x i32> %3, <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { + return vsudot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdot_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) { + return vusdot_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> 
%r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) { + return vsudot_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusdotq_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { + return vusdotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { + return vsudotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusdotq_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// 
CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) { + return vsudotq_laneq_s32(r, a, b, 0); +} Index: clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c =================================================================== --- /dev/null +++ clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c @@ -0,0 +1,100 @@ +// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \ +// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg \ +// RUN: | FileCheck %s + +// REQUIRES: arm-registered-target + +#include + +// CHECK-LABEL: test_vmmlaq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { + return vmmlaq_s32(r, a, b); +} + +// CHECK-LABEL: test_vmmlaq_u32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { + return vmmlaq_u32(r, a, b); +} + +// CHECK-LABEL: test_vusmmlaq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusmmlaq_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdot_s32 +// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) +// CHECK: ret <2 x i32> [[VAL]] +int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> 
[[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { + return vsudot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { + return vusdotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %1 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> %2 to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> %3, <2 x i32> %3, <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { + return vsudotq_lane_s32(r, a, b, 0); +} Index: clang/test/Driver/aarch64-cpus.c =================================================================== --- clang/test/Driver/aarch64-cpus.c +++ clang/test/Driver/aarch64-cpus.c @@ -636,6 +636,34 @@ // RUN: %clang -target aarch64 -march=armv8.5a+bf16+sve -### -c %s 2>&1 | FileCheck -check-prefixes=GENERICV85A-BF16-SVE %s // GENERICV85A-BF16-SVE: "-target-feature" "+bf16" "-target-feature" "+sve" +// The 8-bit integer matrix multiply extension is a mandatory component of the +// Armv8.6-A extensions, but is permitted as an optional feature for any +// implementation of Armv8.2-A to Armv8.5-A (inclusive) +// RUN: %clang -target aarch64 -march=armv8.5a -### -c %s 2>&1 | FileCheck -check-prefix=NO-I8MM %s +// RUN: %clang -target aarch64 -march=armv8.5a+i8mm -### -c %s 2>&1 | FileCheck -check-prefix=I8MM %s +// NO-I8MM-NOT: "-target-feature" "+i8mm" +// I8MM: "-target-feature" "+i8mm" + +// The 32-bit floating point matrix multiply extension is enabled by default +// for armv8.6-a targets (or later) with SVE, and can optionally be enabled for +// any target from armv8.2a onwards (we don't enforce not using it with earlier +// targets). 
+// RUN: %clang -target aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=NO-F32MM %s +// RUN: %clang -target aarch64 -march=armv8.6a+sve -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s +// RUN: %clang -target aarch64 -march=armv8.5a+f32mm -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s +// NO-F32MM-NOT: "-target-feature" "+f32mm" +// F32MM: "-target-feature" "+f32mm" + +// The 64-bit floating point matrix multiply extension is not currently enabled +// by default for any targets, because it requires an SVE vector length >= 256 +// bits. When we add a CPU which has that, then it can be enabled by default, +// but for now it can only be used by adding the +f64mm feature. +// RUN: %clang -target aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=NO-F64MM %s +// RUN: %clang -target aarch64 -march=armv8.6a+sve -### -c %s 2>&1 | FileCheck -check-prefix=NO-F64MM %s +// RUN: %clang -target aarch64 -march=armv8.6a+f64mm -### -c %s 2>&1 | FileCheck -check-prefix=F64MM %s +// NO-F64MM-NOT: "-target-feature" "+f64mm" +// F64MM: "-target-feature" "+f64mm" + // fullfp16 is off by default for v8a, feature must not be mentioned // RUN: %clang -target aarch64 -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=V82ANOFP16 -check-prefix=GENERIC %s // RUN: %clang -target aarch64 -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=V82ANOFP16 -check-prefix=GENERIC %s Index: clang/test/Driver/arm-matrix-multiply.c =================================================================== --- /dev/null +++ clang/test/Driver/arm-matrix-multiply.c @@ -0,0 +1,14 @@ +// RUN: %clang -### -target arm-none-none-eabi -march=armv8.5a+i8mm %s 2>&1 | FileCheck %s +// RUN: %clang -### -target aarch64-none-none-eabi -march=armv8.5a+i8mm %s 2>&1 | FileCheck %s +// CHECK: "-target-feature" "+i8mm" +// CHECK-NOT: "-target-feature" "-i8mm" + +// RUN: %clang -### -target arm-none-none-eabi -march=armv8.6a+noi8mm %s 2>&1 | FileCheck %s --check-prefix=NOI8MM +// RUN: %clang -### -target aarch64-none-none-eabi -march=armv8.6a+noi8mm %s 2>&1 | FileCheck %s --check-prefix=NOI8MM +// NOI8MM: "-target-feature" "-i8mm" +// NOI8MM-NOT: "-target-feature" "+i8mm" + +// RUN: %clang -### -target arm-none-none-eabi %s 2>&1 | FileCheck %s --check-prefix=ABSENT +// RUN: %clang -### -target aarch64-none-none-eabi %s 2>&1 | FileCheck %s --check-prefix=ABSENT +// ABSENT-NOT: "-target-feature" "+i8mm" +// ABSENT-NOT: "-target-feature" "-i8mm" Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -173,6 +173,11 @@ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + + class AdvSIMD_MatMul_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; } // Arithmetic ops @@ -449,6 +454,12 @@ def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic; +// v8.6-A Matrix Multiply Intrinsics + def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; + // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; Index: llvm/include/llvm/IR/IntrinsicsARM.td 
=================================================================== --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -773,6 +773,19 @@ def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; +// v8.6-A Matrix Multiply Intrinsics +class Neon_MatMul_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, + LLVMMatchType<1>], + [IntrNoMem]>; +def int_arm_neon_ummla : Neon_MatMul_Intrinsic; +def int_arm_neon_smmla : Neon_MatMul_Intrinsic; +def int_arm_neon_usmmla : Neon_MatMul_Intrinsic; +def int_arm_neon_usdot : Neon_Dot_Intrinsic; + +// v8.6-A Bfloat Intrinsics + def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; Index: llvm/include/llvm/Support/AArch64TargetParser.h =================================================================== --- llvm/include/llvm/Support/AArch64TargetParser.h +++ llvm/include/llvm/Support/AArch64TargetParser.h @@ -24,7 +24,7 @@ namespace AArch64 { // Arch extension modifiers for CPUs. -enum ArchExtKind : unsigned { +enum ArchExtKind : uint64_t { AEK_INVALID = 0, AEK_NONE = 1, AEK_CRC = 1 << 1, @@ -57,6 +57,8 @@ AEK_TME = 1 << 28, AEK_BF16 = 1 << 29, AEK_I8MM = 1 << 30, + AEK_F32MM = 1ULL << 31, + AEK_F64MM = 1ULL << 32, }; enum class ArchKind { Index: llvm/include/llvm/Support/AArch64TargetParser.def =================================================================== --- llvm/include/llvm/Support/AArch64TargetParser.def +++ llvm/include/llvm/Support/AArch64TargetParser.def @@ -88,6 +88,8 @@ AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres") AARCH64_ARCH_EXT_NAME("bf16", AArch64::AEK_BF16, "+bf16", "-bf16") AARCH64_ARCH_EXT_NAME("i8mm", AArch64::AEK_I8MM, "+i8mm", "-i8mm") +AARCH64_ARCH_EXT_NAME("f32mm", AArch64::AEK_F32MM, "+f32mm", "-f32mm") +AARCH64_ARCH_EXT_NAME("f64mm", AArch64::AEK_F64MM, "+f64mm", "-f64mm") AARCH64_ARCH_EXT_NAME("tme", AArch64::AEK_TME, "+tme", "-tme") #undef AARCH64_ARCH_EXT_NAME Index: llvm/include/llvm/Support/ARMTargetParser.h =================================================================== --- llvm/include/llvm/Support/ARMTargetParser.h +++ llvm/include/llvm/Support/ARMTargetParser.h @@ -47,14 +47,15 @@ AEK_FP_DP = 1 << 18, AEK_LOB = 1 << 19, AEK_BF16 = 1 << 20, - AEK_CDECP0 = 1 << 21, - AEK_CDECP1 = 1 << 22, - AEK_CDECP2 = 1 << 23, - AEK_CDECP3 = 1 << 24, - AEK_CDECP4 = 1 << 25, - AEK_CDECP5 = 1 << 26, - AEK_CDECP6 = 1 << 27, - AEK_CDECP7 = 1 << 28, + AEK_I8MM = 1 << 21, + AEK_CDECP0 = 1 << 22, + AEK_CDECP1 = 1 << 23, + AEK_CDECP2 = 1 << 24, + AEK_CDECP3 = 1 << 25, + AEK_CDECP4 = 1 << 26, + AEK_CDECP5 = 1 << 27, + AEK_CDECP6 = 1 << 28, + AEK_CDECP7 = 1 << 29, // Unsupported extensions. 
AEK_OS = 1ULL << 59, Index: llvm/include/llvm/Support/ARMTargetParser.def =================================================================== --- llvm/include/llvm/Support/ARMTargetParser.def +++ llvm/include/llvm/Support/ARMTargetParser.def @@ -116,7 +116,8 @@ ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS | - ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_SHA2 | ARM::AEK_AES)) + ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_SHA2 | ARM::AEK_AES | + ARM::AEK_I8MM)) ARM_ARCH("armv8-r", ARMV8R, "8-R", "v8r", ARMBuildAttrs::CPUArch::v8_R, FK_NEON_FP_ARMV8, (ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | @@ -171,6 +172,7 @@ ARM_ARCH_EXT_NAME("fp16fml", ARM::AEK_FP16FML, "+fp16fml", "-fp16fml") ARM_ARCH_EXT_NAME("bf16", ARM::AEK_BF16, "+bf16", "-bf16") ARM_ARCH_EXT_NAME("sb", ARM::AEK_SB, "+sb", "-sb") +ARM_ARCH_EXT_NAME("i8mm", ARM::AEK_I8MM, "+i8mm", "-i8mm") ARM_ARCH_EXT_NAME("lob", ARM::AEK_LOB, "+lob", "-lob") ARM_ARCH_EXT_NAME("cdecp0", ARM::AEK_CDECP0, "+cdecp0", "-cdecp0") ARM_ARCH_EXT_NAME("cdecp1", ARM::AEK_CDECP1, "+cdecp1", "-cdecp1") Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -373,6 +373,15 @@ def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true", "Enable BFloat16 Extension" >; +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension">; + +def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32", + "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>; + +def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64", + "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; + def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", "true", "Enable fine grained virtualization traps extension">; @@ -380,7 +389,6 @@ SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", "true", "Enable enhanced counter virtualization extension">; - //===----------------------------------------------------------------------===// // Architectures. 
// @@ -413,7 +421,7 @@ "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, - FeatureEnhancedCounterVirtualization]>; + FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; //===----------------------------------------------------------------------===// // Register File Description Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -498,6 +498,7 @@ def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>; def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>; def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>; +def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>; def simm4s1 : Operand, ImmLeaf=-8 && Imm <= 7; }]> { @@ -531,6 +532,12 @@ let ParserMatchClass = SImm4s16Operand; let DecoderMethod = "DecodeSImm<4>"; } +def simm4s32 : Operand, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { + let PrintMethod = "printImmScale<32>"; + let ParserMatchClass = SImm4s32Operand; + let DecoderMethod = "DecodeSImm<4>"; +} def Imm1_8Operand : AsmImmRange<1, 8>; def Imm1_16Operand : AsmImmRange<1, 16>; @@ -5537,11 +5544,11 @@ // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract // bytes from S-sized elements. -class BaseSIMDThreeSameVectorDot : - BaseSIMDThreeSameVectorTied { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64, +multiclass SIMDThreeSameVectorDot { + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128, + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } @@ -7890,13 +7897,26 @@ } } // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0 +//---------------------------------------------------------------------------- +// Armv8.6 Matrix Multiply Extension +//---------------------------------------------------------------------------- + +class SIMDThreeSameVectorMatMul + : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; +} + +//---------------------------------------------------------------------------- // ARMv8.2-A Dot Product Instructions (Indexed) -class BaseSIMDThreeSameVectorDotIndex size, string asm, + string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", + def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", + def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -146,6 +146,12 @@ AssemblerPredicate<(all_of FeatureTRBE), "trbe">; def HasBF16 : Predicate<"Subtarget->hasBF16()">, 
AssemblerPredicate<(all_of FeatureBF16), "bf16">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; +def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, + AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; +def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, + AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -745,10 +751,10 @@ // ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { -defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>; -defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>; -defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>; -defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>; +defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>; +defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>; +defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>; +defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>; } // ARMv8.6-A BFloat @@ -765,6 +771,40 @@ def BFCVT : BF16ToSinglePrecision<"bfcvt">; } +// ARMv8.6A AArch64 matrix multiplication +let Predicates = [HasMatMulInt8] in { +def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; +def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; +def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; + +// sudot lane has a pattern where usdot is expected (there is no sudot). +// The second operand is used in the dup operation to repeat the indexed +// element. 
+class BaseSIMDSUDOTIndex + : BaseSIMDThreeSameVectorDotIndex { + let Pattern = [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (InputType RegType:$Rn))))]; +} + +multiclass SIMDSUDOTIndex { + def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>; + def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>; +} + +defm SUDOTlane : SIMDSUDOTIndex; + +} + // ARMv8.2-A FP16 Fused Multiply-Add Long let Predicates = [HasNEON, HasFP16FML] in { defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>; Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1634,6 +1634,37 @@ defm : ldff1; } +let Predicates = [HasSVE, HasMatMulInt8] in { + def SMMLA_ZZZ : sve_int_matmul<0b00, "smmla">; + def UMMLA_ZZZ : sve_int_matmul<0b11, "ummla">; + def USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla">; + def USDOT_ZZZ : sve_int_dot_mixed<"usdot">; + def USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot">; + def SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot">; +} + +let Predicates = [HasSVE, HasMatMulFP32] in { + def FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32>; +} + +let Predicates = [HasSVE, HasMatMulFP64] in { + def FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64>; + defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8>; + defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16>; + defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32>; + defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64>; + defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8>; + defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16>; + defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32>; + defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64>; + def ZIP1_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1">; + def ZIP2_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2">; + def UZP1_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1">; + def UZP2_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2">; + def TRN1_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1">; + def TRN2_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2">; +} + let Predicates = [HasSVE2] in { // SVE2 integer multiply-add (indexed) defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -147,6 +147,9 @@ // Armv8.6-A Extensions bool HasBF16 = false; + bool HasMatMulInt8 = false; + bool HasMatMulFP32 = false; + bool HasMatMulFP64 = false; bool HasAMVS = false; bool HasFineGrainedTraps = false; bool HasEnhancedCounterVirtualization = false; @@ -414,6 +417,9 @@ bool hasSVE2SM4() const { return HasSVE2SM4; } bool hasSVE2SHA3() const { return HasSVE2SHA3; } bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } + bool hasMatMulInt8() const { return HasMatMulInt8; } + bool hasMatMulFP32() const { return HasMatMulFP32; } + bool hasMatMulFP64() const { return HasMatMulFP64; } // Armv8.6-A Extensions bool 
hasBF16() const { return HasBF16; } Index: llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp =================================================================== --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4255,6 +4255,8 @@ return Error(Loc, "index must be a multiple of 4 in range [-32, 28]."); case Match_InvalidMemoryIndexed16SImm4: return Error(Loc, "index must be a multiple of 16 in range [-128, 112]."); + case Match_InvalidMemoryIndexed32SImm4: + return Error(Loc, "index must be a multiple of 32 in range [-256, 224]."); case Match_InvalidMemoryIndexed1SImm6: return Error(Loc, "index must be an integer in range [-32, 31]."); case Match_InvalidMemoryIndexedSImm8: @@ -4914,6 +4916,7 @@ case Match_InvalidMemoryIndexed4SImm4: case Match_InvalidMemoryIndexed1SImm6: case Match_InvalidMemoryIndexed16SImm4: + case Match_InvalidMemoryIndexed32SImm4: case Match_InvalidMemoryIndexed4SImm7: case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7528,6 +7528,180 @@ let ElementSize = ElementSizeS; } +//===----------------------------------------------------------------------===// +// SVE Integer Matrix Multiply Group +//===----------------------------------------------------------------------===// + +class sve_int_matmul uns, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = uns; + let Inst{21} = 0; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b100110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000100100; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b011110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign - Indexed Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed_indexed +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS:$idx), + asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<2> idx; + let Inst{31-21} = 0b01000100101; + let Inst{20-19} = idx; + let Inst{18-16} = Zm; + let Inst{15-11} = 0b00011; + let Inst{10} = U; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Floating Point Matrix Multiply 
Accumulate Group +//===----------------------------------------------------------------------===// + +class sve_fp_matrix_mla +: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-23} = 0b011001001; + let Inst{22} = sz; + let Inst{21} = 1; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b111001; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Memory - Contiguous Load And Replicate 256-bit Group +//===----------------------------------------------------------------------===// +class sve_mem_ldor_si sz, string asm, RegisterOperand VecList> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), + asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> { + bits<5> Zt; + bits<5> Rn; + bits<3> Pg; + bits<4> imm4; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-20} = 0b010; + let Inst{19-16} = imm4; + let Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_si sz, string asm, RegisterOperand listty, + ZPRRegOp zprty> { + def NAME : sve_mem_ldor_si; + def : InstAlias(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; +} + +class sve_mem_ldor_ss sz, string asm, RegisterOperand VecList, + RegisterOperand gprty> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), + asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> { + bits<5> Zt; + bits<3> Pg; + bits<5> Rn; + bits<5> Rm; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-21} = 0b01; + let Inst{20-16} = Rm; + let Inst{15-13} = 0; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_ss sz, string asm, RegisterOperand listty, + ZPRRegOp zprty, RegisterOperand gprty> { + def NAME : sve_mem_ldor_ss; + + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; +} + +//===----------------------------------------------------------------------===// +// SVE Interleave 128-bit Elements Group +//===----------------------------------------------------------------------===// + +class sve_int_perm_bin_perm_128_zz opc, bit P, string asm> +: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-21} = 0b00000101101; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b000; + let Inst{12-11} = opc; + let Inst{10} = P; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + + /// Addressing modes def am_sve_indexed_s4 :ComplexPattern", [], [SDNPWantRoot]>; def am_sve_indexed_s6 :ComplexPattern", [], [SDNPWantRoot]>; Index: llvm/lib/Target/ARM/ARM.td =================================================================== --- llvm/lib/Target/ARM/ARM.td +++ llvm/lib/Target/ARM/ARM.td @@ -428,6 +428,9 @@ def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true", "Enable support for BFloat16 instructions", [FeatureNEON]>; +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension", [FeatureNEON]>; + // Armv8.1-M 
extensions def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true", @@ -529,7 +532,8 @@ def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", - [HasV8_5aOps, FeatureBF16]>; + [HasV8_5aOps, FeatureBF16, + FeatureMatMulInt8]>; def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", Index: llvm/lib/Target/ARM/ARMInstrNEON.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrNEON.td +++ llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4823,10 +4823,10 @@ // We put them in the VFPV8 decoder namespace because the ARM and Thumb // encodings are the same and thus no further bit twiddling is necessary // in the disassembler. -class VDOT : - N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst), + N3Vnp<{0b1100, op23}, 0b10, 0b1101, op6, op4, (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD, Asm, AsmTy, [(set (AccumTy RegTy:$dst), @@ -4836,12 +4836,13 @@ let Predicates = [HasDotProd]; let DecoderNamespace = "VFPV8"; let Constraints = "$dst = $Vd"; + let hasNoSchedulingInfo = 1; } -def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>; -def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>; -def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>; -def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>; +def VUDOTD : VDOT<0, 1, 0, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>; +def VSDOTD : VDOT<0, 0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>; +def VUDOTQ : VDOT<1, 1, 0, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>; +def VSDOTQ : VDOT<1, 0, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>; // Indexed dot product instructions: multiclass DOTI; +// v8.6A matrix multiplication extension +let Predicates = [HasMatMulInt8] in { + class N3VMatMul + : N3Vnp<{0b1100, B}, 0b10, 0b1100, 1, U, (outs QPR:$dst), + (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), N3RegFrm, NoItinerary, + Asm, AsmTy, + [(set (v4i32 QPR:$dst), (OpNode (v4i32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { + let DecoderNamespace = "VFPV8"; + let Constraints = "$dst = $Vd"; + let hasNoSchedulingInfo = 1; + } + + multiclass N3VMixedDotLane { + + def "" : N3Vnp<0b11101, 0b00, 0b1101, Q, U, (outs RegTy:$dst), + (ins RegTy:$Vd, RegTy:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), N3RegFrm, + NoItinerary, Asm, AsmTy, []> { + bit lane; + let hasNoSchedulingInfo = 1; + let Inst{5} = lane; + let AsmString = !strconcat(Asm, ".", AsmTy, "\t$Vd, $Vn, $Vm$lane"); + let DecoderNamespace = "VFPV8"; + let Constraints = "$dst = $Vd"; + } + + def : Pat< + (AccumTy (OpNode (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))))), + (!cast(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; + + } + + multiclass SUDOTLane + : N3VMixedDotLane { + def : Pat< + (AccumTy (int_arm_neon_usdot (AccumTy RegTy:$Vd), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))), + (InputTy RegTy:$Vn))), + (!cast(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; + } + + def VSMMLA : N3VMatMul<0, 0, "vsmmla", "s8", int_arm_neon_smmla>; + def VUMMLA : N3VMatMul<0, 1, "vummla", "u8", int_arm_neon_ummla>; + def VUSMMLA : N3VMatMul<1, 0, "vusmmla", "s8", int_arm_neon_usmmla>; + def VUSDOTD : VDOT<0, 0, 1, DPR, "vusdot", "s8", v2i32, v8i8, int_arm_neon_usdot>; + def VUSDOTQ : VDOT<1, 0, 1, 
QPR, "vusdot", "s8", v4i32, v16i8, int_arm_neon_usdot>; + + defm VUSDOTDI : N3VMixedDotLane<0, 0, "vusdot", "s8", DPR, v2i32, v8i8, + int_arm_neon_usdot, (v2i32 DPR_VFP2:$Vm)>; + defm VUSDOTQI : N3VMixedDotLane<1, 0, "vusdot", "s8", QPR, v4i32, v16i8, + int_arm_neon_usdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; + defm VSUDOTDI : SUDOTLane<0, DPR, v2i32, v8i8, (v2i32 DPR_VFP2:$Vm)>; + defm VSUDOTQI : SUDOTLane<1, QPR, v4i32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +} // ARMv8.3 complex operations class BaseN3VCP8ComplexTied; def HasBF16 : Predicate<"Subtarget->hasBF16()">, AssemblerPredicate<(all_of FeatureBF16),"BFloat16 floating point extension">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8),"8-bit integer matrix multiply">; def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">, AssemblerPredicate<(all_of FeatureHWDivThumb), "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, Index: llvm/lib/Target/ARM/ARMSubtarget.h =================================================================== --- llvm/lib/Target/ARM/ARMSubtarget.h +++ llvm/lib/Target/ARM/ARMSubtarget.h @@ -260,6 +260,9 @@ /// HasBF16 - True if subtarget supports BFloat16 floating point operations bool HasBF16 = false; + /// HasMatMulInt8 - True if subtarget supports 8-bit integer matrix multiply + bool HasMatMulInt8 = false; + /// HasD32 - True if subtarget has the full 32 double precision /// FP registers for VFPv3. bool HasD32 = false; @@ -704,6 +707,8 @@ /// Return true if the CPU supports any kind of instruction fusion. bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); } + bool hasMatMulInt8() const { return HasMatMulInt8; } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } Index: llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp =================================================================== --- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6330,7 +6330,10 @@ Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || - Mnemonic == "csetm") + Mnemonic == "csetm" || + Mnemonic == "vsmmla" || Mnemonic == "vummla" || + Mnemonic == "vusmmla" || Mnemonic == "vsudot" || + Mnemonic == "vusdot") return Mnemonic; // First, split out any predication code. 
Ignore mnemonics we know aren't @@ -6466,7 +6469,9 @@ Mnemonic == "vfmat" || Mnemonic == "vfmab" || Mnemonic == "vdot" || Mnemonic == "vmmla" || Mnemonic == "sb" || Mnemonic == "ssbb" || - Mnemonic == "pssbb" || + Mnemonic == "pssbb" || Mnemonic == "vsmmla" || + Mnemonic == "vummla" || Mnemonic == "vusmmla" || + Mnemonic == "vusdot" || Mnemonic == "vsudot" || Mnemonic == "bfcsel" || Mnemonic == "wls" || Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Index: llvm/test/CodeGen/AArch64/aarch64-matmul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-matmul.ll @@ -0,0 +1,136 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s + +define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: smmla.v4i32.v16i8 +; CHECK: smmla v0.4s, v1.16b, v2.16b + %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: ummla.v4i32.v16i8 +; CHECK: ummla v0.4s, v1.16b, v2.16b + %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usmmla.v4i32.v16i8 +; CHECK: usmmla v0.4s, v1.16b, v2.16b + %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusmmla1.i +} + +define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v2i32.v8i8 +; CHECK: usdot v0.2s, v1.8b, v2.8b + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v8i8 +; CHECK: usdot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v8i8 +; CHECK: sudot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v16i8 +; CHECK: usdot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, 
<8 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v16i8 +; CHECK: sudot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3 + ret <2 x i32> %vusdot1.i +} + +define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.16b + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v4i32.v16i8 +; CHECK: sudot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot_laneq.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: sudot_laneq.v4i32.v16i8 +; CHECK: sudot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 + Index: llvm/test/CodeGen/ARM/arm-matmul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/ARM/arm-matmul.ll @@ -0,0 +1,83 @@ +; RUN: llc -mtriple=arm-none-linux-gnu -mattr=+neon,+i8mm -float-abi=hard < %s -o -| FileCheck %s + +define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; 
CHECK-LABEL: smmla.v4i32.v16i8 +; CHECK: vsmmla.s8 q0, q1, q2 + %vmmla1.i = tail call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: ummla.v4i32.v16i8 +; CHECK: vummla.u8 q0, q1, q2 + %vmmla1.i = tail call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usmmla.v4i32.v16i8 +; CHECK: vusmmla.s8 q0, q1, q2 + %vusmmla1.i = tail call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusmmla1.i +} + +define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v2i32.v8i8 +; CHECK: vusdot.s8 d0, d1, d2 + %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) #3 + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v8i8 +; CHECK: vusdot.s8 d0, d1, d2[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) #3 + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v8i8 +; CHECK: vsudot.u8 d0, d1, d2[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3 + ret <2 x i32> %vusdot1.i +} + +define <4 x i32> @usdotq_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdotq_lane.v4i32.v16i8 +; CHECK: vusdot.s8 q0, q1, d4[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudotq_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudotq_lane.v4i32.v16i8 +; CHECK: vsudot.u8 q0, q1, d4[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +declare <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 +declare <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 Index: llvm/test/MC/AArch64/SVE/matrix-multiply-fp-diagnostics.s 
=================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-fp-diagnostics.s @@ -0,0 +1,86 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+f32mm,+f64mm 2>&1 < %s | FileCheck %s + +// --------------------------------------------------------------------------// +// FMMLA (SVE) + +// Invalid element size + +fmmla z0.h, z1.h, z2.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width + +// Mis-matched element size + +fmmla z0.d, z1.s, z2.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +fmmla z0.s, z1.d, z2.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +fmmla z0.s, z1.s, z2.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width + + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus immediate) + +// Immediate too high (>224) +ld1rob { z0.b }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1roh { z0.h }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1row { z0.s }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1rod { z0.d }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. + +// Immediate too low (<-256) +ld1rob { z0.b }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1roh { z0.h }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1row { z0.s }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1rod { z0.d }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. + +// Immediate not a multiple of 32 +ld1rob { z0.b }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1roh { z0.h }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1row { z0.s }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1rod { z0.d }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. 
+ +// Predicate register too high +ld1rob { z0.b }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1roh { z0.h }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1row { z0.s }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1rod { z0.d }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) + + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus scalar) + +// Shift amount not matched to data width +ld1rob { z0.b }, p1/z, [x2, x3, lsl #1] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 without shift +ld1roh { z0.h }, p1/z, [x2, x3, lsl #0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 with required shift 'lsl #1' +ld1row { z0.s }, p1/z, [x2, x3, lsl #3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 with required shift 'lsl #2' +ld1rod { z0.d }, p1/z, [x2, x3, lsl #2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 with required shift 'lsl #3' + +// Predicate register too high +ld1rob { z0.b }, p8/z, [x2, x3, lsl #0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1roh { z0.h }, p8/z, [x2, x3, lsl #1] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1row { z0.s }, p8/z, [x2, x3, lsl #2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1rod { z0.d }, p8/z, [x2, x3, lsl #3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) Index: llvm/test/MC/AArch64/SVE/matrix-multiply-fp32.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-fp32.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+f32mm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm,+f64mm < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f32mm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve,+f32mm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f32mm < %s \ +// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +// --------------------------------------------------------------------------// +// FMMLA (SVE) + +fmmla z0.s, z1.s, z2.s +// CHECK-INST: fmmla z0.s, z1.s, z2.s +// CHECK-ENCODING: [0x20,0xe4,0xa2,0x64] +// CHECK-ERROR: instruction requires: f32mm +// CHECK-UNKNOWN: 20 e4 a2 64 Index: llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s @@ -0,0 +1,185 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+f64mm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm,+f32mm < %s 2>&1 \ +// RUN: |
FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f64mm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve,+f64mm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f64mm < %s \ +// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +// --------------------------------------------------------------------------// +// FMMLA (SVE) + +fmmla z0.d, z1.d, z2.d +// CHECK-INST: fmmla z0.d, z1.d, z2.d +// CHECK-ENCODING: [0x20,0xe4,0xe2,0x64] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 e4 e2 64 + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus immediate) + +// With maximum immediate (224) + +ld1rob { z0.b }, p1/z, [x2, #224] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0x27,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 27 a4 + +ld1roh { z0.h }, p1/z, [x2, #224] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0xa7,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a7 a4 + +ld1row { z0.s }, p1/z, [x2, #224] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0x27,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 27 a5 + +ld1rod { z0.d }, p1/z, [x2, #224] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0xa7,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a7 a5 + +// With minimum immediate (-256) + +ld1rob { z0.b }, p1/z, [x2, #-256] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0x28,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 28 a4 + +ld1roh { z0.h }, p1/z, [x2, #-256] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0xa8,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a8 a4 + +ld1row { z0.s }, p1/z, [x2, #-256] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0x28,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 28 a5 + +ld1rod { z0.d }, p1/z, [x2, #-256] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0xa8,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a8 a5 + +// Aliases with a plain (non-list) first operand, and omitted offset. 
+ +ld1rob z0.b, p1/z, [x2] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0x20,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 20 a4 + +ld1roh z0.h, p1/z, [x2] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0xa0,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a0 a4 + +ld1row z0.s, p1/z, [x2] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0x20,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 20 a5 + +ld1rod z0.d, p1/z, [x2] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0xa0,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a0 a5 + + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus scalar) + +ld1rob { z0.b }, p1/z, [x2, x3, lsl #0] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, x3] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a4 + +ld1roh { z0.h }, p1/z, [x2, x3, lsl #1] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, x3, lsl #1] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a4 + +ld1row { z0.s }, p1/z, [x2, x3, lsl #2] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, x3, lsl #2] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a5 + +ld1rod { z0.d }, p1/z, [x2, x3, lsl #3] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, x3, lsl #3] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a5 + +// Aliases with a plain (non-list) first operand, and omitted shift for the +// byte variant. 
+ +ld1rob z0.b, p1/z, [x2, x3] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, x3] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a4 + +ld1roh z0.h, p1/z, [x2, x3, lsl #1] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, x3, lsl #1] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a4 + +ld1row z0.s, p1/z, [x2, x3, lsl #2] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, x3, lsl #2] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a5 + +ld1rod z0.d, p1/z, [x2, x3, lsl #3] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, x3, lsl #3] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a5 + + +// --------------------------------------------------------------------------// +// ZIP1, ZIP2 (SVE, 128-bit element) + +zip1 z0.q, z1.q, z2.q +// CHECK-INST: zip1 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x00,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 00 a2 05 + +zip2 z0.q, z1.q, z2.q +// CHECK-INST: zip2 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x04,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 04 a2 05 + + +// --------------------------------------------------------------------------// +// TRN1, TRN2 (SVE, 128-bit element) + +trn1 z0.q, z1.q, z2.q +// CHECK-INST: trn1 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x18,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 18 a2 05 + +trn2 z0.q, z1.q, z2.q +// CHECK-INST: trn2 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x1c,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 1c a2 05 Index: llvm/test/MC/AArch64/SVE/matrix-multiply-int8-diagnostics.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-int8-diagnostics.s @@ -0,0 +1,78 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm 2>&1 < %s | FileCheck %s + +// --------------------------------------------------------------------------// +// SMMLA, UMMLA, USMMLA (SVE) + +// Invalid element size + +ummla z0.h, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +ummla z0.s, z1.h, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +ummla z0.s, z1.b, z2.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width + +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +ummla z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx +movprfx z0.d, p0/z, z7.d +smmla z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx +movprfx z0.d, p0/z, z7.d +usmmla z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx + + +// --------------------------------------------------------------------------// +// USDOT (SVE, vectors) + +// Invalid element size + +usdot z0.d, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +usdot z0.s, z1.s, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +usdot z0.s, z1.b, z2.h +// CHECK: 
[[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z7.b + +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +usdot z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx + + +// --------------------------------------------------------------------------// +// USDOT, SUDOT (SVE, indexed) + +// Invalid element size + +usdot z0.h, z1.b, z2.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +sudot z0.s, z1.h, z2.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +usdot z0.s, z1.b, z2.s[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z7.b + +// Invalid restricted register for indexed vector. +usdot z0.s, z1.b, z9.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +sudot z0.s, z1.b, z9.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z7.b + +// Invalid element index +usdot z0.s, z1.b, z2.b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot z0.s, z1.b, z2.b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. + +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +usdot z0.s, z1.b, z2.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx +movprfx z0.d, p0/z, z7.d +sudot z0.s, z1.b, z2.b[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx Index: llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s @@ -0,0 +1,118 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+i8mm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve,+i8mm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+i8mm < %s \ +// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + + +// --------------------------------------------------------------------------// +// SMMLA, UMMLA, USMMLA (SVE) + +ummla z0.s, z1.b, z2.b +// CHECK-INST: ummla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0xc2,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 c2 45 + +smmla z0.s, z1.b, z2.b +// CHECK-INST: smmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x02,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 02 45 + +usmmla z0.s, z1.b, z2.b +// CHECK-INST: usmmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x82,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 82 45 + + +// Test compatibility with MOVPRFX instruction. 
+ +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +ummla z0.s, z1.b, z2.b +// CHECK-INST: ummla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0xc2,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 c2 45 + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +smmla z0.s, z1.b, z2.b +// CHECK-INST: smmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x02,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 02 45 + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +usmmla z0.s, z1.b, z2.b +// CHECK-INST: usmmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x82,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 82 45 + + +// --------------------------------------------------------------------------// +// USDOT (SVE, vectors) + +usdot z0.s, z1.b, z2.b +// CHECK-INST: usdot z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x78,0x82,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 78 82 44 + +// Test compatibility with MOVPRFX instruction. + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +usdot z0.s, z1.b, z2.b +// CHECK-INST: usdot z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x78,0x82,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 78 82 44 + + +// --------------------------------------------------------------------------// +// USDOT, SUDOT (SVE, indexed) + +usdot z0.s, z1.b, z2.b[0] +// CHECK-INST: usdot z0.s, z1.b, z2.b[0] +// CHECK-ENCODING: [0x20,0x18,0xa2,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 18 a2 44 + +sudot z0.s, z1.b, z2.b[3] +// CHECK-INST: sudot z0.s, z1.b, z2.b[3] +// CHECK-ENCODING: [0x20,0x1c,0xba,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 1c ba 44 + +// Test compatibility with MOVPRFX instruction. + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +usdot z0.s, z1.b, z2.b[0] +// CHECK-INST: usdot z0.s, z1.b, z2.b[0] +// CHECK-ENCODING: [0x20,0x18,0xa2,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 18 a2 44 Index: llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s @@ -0,0 +1,34 @@ +// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s + +// No interesting edge cases for [US]MMLA, except for the fact that the data +// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not. +smmla v1.2s, v16.8b, v31.8b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +summla v1.4s, v16.16b, v31.16b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla? + +// USDOT (vector) has two valid data type combinations, others are rejected. 
+usdot v3.4s, v15.8b, v30.8b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +usdot v3.2s, v15.16b, v30.16b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types) +usdot v31.2s, v1.8b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +usdot v31.4s, v1.16b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot v31.2s, v1.8b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot v31.4s, v1.16b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. + +// The arrangement specifiers of the first two operands must match. +usdot v31.4s, v1.8b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +usdot v31.2s, v1.16b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +sudot v31.4s, v1.8b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +sudot v31.2s, v1.16b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction Index: llvm/test/MC/AArch64/armv8.6a-simd-matmul.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/armv8.6a-simd-matmul.s @@ -0,0 +1,43 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a < %s | FileCheck %s +// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +smmla v1.4s, v16.16b, v31.16b +ummla v1.4s, v16.16b, v31.16b +usmmla v1.4s, v16.16b, v31.16b +// CHECK: smmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e] +// CHECK: ummla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e] +// CHECK: usmmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: smmla v1.4s, v16.16b, v31.16b +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: ummla v1.4s, v16.16b, v31.16b +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usmmla v1.4s, v16.16b, v31.16b + +usdot v3.2s, v15.8b, v30.8b +usdot v3.4s, v15.16b, v30.16b +// CHECK: usdot v3.2s, v15.8b, v30.8b // encoding: [0xe3,0x9d,0x9e,0x0e] +// CHECK: usdot v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b + +usdot v31.2s, v1.8b, v2.4b[3] +usdot v31.4s, v1.16b, v2.4b[3] +// CHECK: usdot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f] +// CHECK: usdot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v31.2s, v1.8b, v2.4b[3] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v31.4s, v1.16b, v2.4b[3] + +sudot v31.2s, v1.8b, v2.4b[3] +sudot v31.4s, v1.16b, v2.4b[3] +// CHECK: sudot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f] +// CHECK: sudot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: sudot v31.2s, v1.8b, v2.4b[3] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: sudot v31.4s, v1.16b, v2.4b[3] Index:
llvm/test/MC/ARM/armv8.6a-matmul-error.s =================================================================== --- /dev/null +++ llvm/test/MC/ARM/armv8.6a-matmul-error.s @@ -0,0 +1,113 @@ +// RUN: not llvm-mc -triple armv8a -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple thumbv8a -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s + + +// VSMMLA, VUMMLA, VUSMMLA + +// Data type specifier must match instruction + +vsmmla.u8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vsmmla.u8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +vummla.s8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vummla.s8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +vusmmla.u8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vusmmla.u8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + + +// Incorrect register type + +vsmmla.s8 d0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q15] +// CHECK-NEXT: vsmmla.s8 d0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +vummla.u8 q0, d1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q15] +// CHECK-NEXT: vummla.u8 q0, d1, q2 +// CHECK-NEXT: {{^ \^}} + +vusmmla.s8 q0, q1, d2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q15] +// CHECK-NEXT: vusmmla.s8 q0, q1, d2 +// CHECK-NEXT: {{^ \^}} + + +// VUSDOT (vector) + +// Data type specifier must match instruction + +vusdot.u8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vusdot.u8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +// Mis-matched register types + +vusdot.s8 q0, d1, d2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d31] +vusdot.s8 d0, q1, d2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d31] +vusdot.s8 d0, d1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d31] + + +// VUSDOT, VSUDOT (by scalar) + +// Data type specifier must match instruction + +vusdot.u8 d0, d1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vusdot.u8 d0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} + +vsudot.s8 d0, d1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vsudot.s8 d0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} + +// Incorrect register types + +vusdot.s8 q0, d1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this: +// CHECK-NEXT: vusdot.s8 q0, d1, d2[0] +// CHECK: [[@LINE-3]]:{{[0-9]+}}: note: operand must be a register in range [d0, d31] +// CHECK-NEXT: vusdot.s8 q0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} +// CHECK: [[@LINE-6]]:{{[0-9]+}}: note: operand must be a register in range [q0, q15] +// CHECK-NEXT: vusdot.s8 q0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} + +vusdot.s8 d0, q1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this: +// CHECK-NEXT: vusdot.s8 d0, q1, d2[0] +// CHECK: [[@LINE-3]]:{{[0-9]+}}: note: operand must be a register in range [d0, d31] +// CHECK-NEXT: vusdot.s8 d0, q1, d2[0] +// CHECK-NEXT: {{^ \^}} +// CHECK: [[@LINE-6]]:{{[0-9]+}}: note: operand must be a register in range [q0, q15] +// CHECK-NEXT: vusdot.s8 d0, q1, d2[0] +// CHECK-NEXT: {{^ \^}} + +vusdot.s8 q0, q1, q2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: 
invalid instruction, any one of the following would fix this: +// CHECK-NEXT: vusdot.s8 q0, q1, q2[0] +// CHECK: [[@LINE-3]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15] +// CHECK-NEXT: vusdot.s8 q0, q1, q2[0] +// CHECK-NEXT: {{^ \^}} +// CHECK: [[@LINE-6]]:{{[0-9]+}}: note: too many operands for instruction +// CHECK-NEXT: vusdot.s8 q0, q1, q2[0] +// CHECK-NEXT: {{^ \^}} + +// Out of range lane index + +vusdot.s8 d0, d1, d2[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +vsudot.u8 q0, q1, d2[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction Index: llvm/test/MC/ARM/armv8.6a-matmul.s =================================================================== --- /dev/null +++ llvm/test/MC/ARM/armv8.6a-matmul.s @@ -0,0 +1,49 @@ +// RUN: llvm-mc -triple armv8a -show-encoding -mattr=+i8mm < %s | FileCheck %s --check-prefix=ARM +// RUN: llvm-mc -triple thumbv8a -show-encoding -mattr=+i8mm < %s | FileCheck %s --check-prefix=THUMB +// RUN: not llvm-mc -triple armv8a -show-encoding -mattr=v8.5a < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL +// RUN: not llvm-mc -triple thumbv8a -show-encoding -mattr=v8.5a < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +vsmmla.s8 q0, q1, q2 +// ARM: vsmmla.s8 q0, q1, q2 @ encoding: [0x44,0x0c,0x22,0xfc] +// THUMB: vsmmla.s8 q0, q1, q2 @ encoding: [0x22,0xfc,0x44,0x0c] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vummla.u8 q0, q1, q2 +// ARM: vummla.u8 q0, q1, q2 @ encoding: [0x54,0x0c,0x22,0xfc] +// THUMB: vummla.u8 q0, q1, q2 @ encoding: [0x22,0xfc,0x54,0x0c] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusmmla.s8 q0, q1, q2 +// ARM: vusmmla.s8 q0, q1, q2 @ encoding: [0x44,0x0c,0xa2,0xfc] +// THUMB: vusmmla.s8 q0, q1, q2 @ encoding: [0xa2,0xfc,0x44,0x0c] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 d0, d1, d2 +// ARM: vusdot.s8 d0, d1, d2 @ encoding: [0x02,0x0d,0xa1,0xfc] +// THUMB: vusdot.s8 d0, d1, d2 @ encoding: [0xa1,0xfc,0x02,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 q0, q1, q2 +// ARM: vusdot.s8 q0, q1, q2 @ encoding: [0x44,0x0d,0xa2,0xfc] +// THUMB: vusdot.s8 q0, q1, q2 @ encoding: [0xa2,0xfc,0x44,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 d0, d1, d2[0] +// ARM: vusdot.s8 d0, d1, d2[0] @ encoding: [0x02,0x0d,0x81,0xfe] +// THUMB: vusdot.s8 d0, d1, d2[0] @ encoding: [0x81,0xfe,0x02,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vsudot.u8 d0, d1, d2[1] +// ARM: vsudot.u8 d0, d1, d2[1] @ encoding: [0x32,0x0d,0x81,0xfe] +// THUMB: vsudot.u8 d0, d1, d2[1] @ encoding: [0x81,0xfe,0x32,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 q0, q1, d2[0] +// ARM: vusdot.s8 q0, q1, d2[0] @ encoding: [0x42,0x0d,0x82,0xfe] +// THUMB: vusdot.s8 q0, q1, d2[0] @ encoding: [0x82,0xfe,0x42,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vsudot.u8 q0, q1, d2[1] +// ARM: vsudot.u8 q0, q1, d2[1] @ encoding: [0x72,0x0d,0x82,0xfe] +// THUMB: vsudot.u8 q0, q1, d2[1] @ encoding: [0x82,0xfe,0x72,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply Index: 
llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt @@ -0,0 +1,34 @@ +# RUN: llvm-mc -triple=aarch64 -mattr=+i8mm -disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple=aarch64 -mattr=+v8.6a -disassemble < %s | FileCheck %s +# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM + +[0x01,0xa6,0x9f,0x4e] +[0x01,0xa6,0x9f,0x6e] +[0x01,0xae,0x9f,0x4e] +# CHECK: smmla v1.4s, v16.16b, v31.16b +# CHECK: ummla v1.4s, v16.16b, v31.16b +# CHECK: usmmla v1.4s, v16.16b, v31.16b +# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xe3,0x9d,0x9e,0x0e] +[0xe3,0x9d,0x9e,0x4e] +# CHECK: usdot v3.2s, v15.8b, v30.8b +# CHECK: usdot v3.4s, v15.16b, v30.16b +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x3f,0xf8,0xa2,0x0f] +[0x3f,0xf8,0xa2,0x4f] +# CHECK: usdot v31.2s, v1.8b, v2.4b[3] +# CHECK: usdot v31.4s, v1.16b, v2.4b[3] +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x3f,0xf8,0x22,0x0f] +[0x3f,0xf8,0x22,0x4f] +# CHECK: sudot v31.2s, v1.8b, v2.4b[3] +# CHECK: sudot v31.4s, v1.16b, v2.4b[3] +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding Index: llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-arm.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-arm.txt @@ -0,0 +1,38 @@ +# RUN: llvm-mc -triple=armv8 -mattr=+i8mm -disassemble < %s | FileCheck %s +# RUN: not llvm-mc -triple=armv8 -mattr=+v8.4a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +[0x44,0x0c,0x22,0xfc] +# CHECK: vsmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x54,0x0c,0x22,0xfc] +# CHECK: vummla.u8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x44,0x0c,0xa2,0xfc] +# CHECK: vusmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x02,0x0d,0xa1,0xfc] +# CHECK: vusdot.s8 d0, d1, d2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x44,0x0d,0xa2,0xfc] +# CHECK: vusdot.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x02,0x0d,0x81,0xfe] +# CHECK: vusdot.s8 d0, d1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x32,0x0d,0x81,0xfe] +# CHECK: vsudot.u8 d0, d1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x42,0x0d,0x82,0xfe] +# CHECK: vusdot.s8 q0, q1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x72,0x0d,0x82,0xfe] +# CHECK: vsudot.u8 q0, q1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding Index: llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-thumb.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-thumb.txt @@ -0,0 +1,38 @@ +# 
RUN: llvm-mc -triple=thumbv8a -mattr=+i8mm -disassemble < %s | FileCheck %s +# RUN: not llvm-mc -triple=thumbv8a -mattr=+v8.4a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +[0x22,0xfc,0x44,0x0c] +# CHECK: vsmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x22,0xfc,0x54,0x0c] +# CHECK: vummla.u8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xa2,0xfc,0x44,0x0c] +# CHECK: vusmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xa1,0xfc,0x02,0x0d] +# CHECK: vusdot.s8 d0, d1, d2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xa2,0xfc,0x44,0x0d] +# CHECK: vusdot.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x81,0xfe,0x02,0x0d] +# CHECK: vusdot.s8 d0, d1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x81,0xfe,0x32,0x0d] +# CHECK: vsudot.u8 d0, d1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x82,0xfe,0x42,0x0d] +# CHECK: vusdot.s8 q0, q1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x82,0xfe,0x72,0x0d] +# CHECK: vsudot.u8 q0, q1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding Index: llvm/unittests/Support/TargetParserTest.cpp =================================================================== --- llvm/unittests/Support/TargetParserTest.cpp +++ llvm/unittests/Support/TargetParserTest.cpp @@ -636,6 +636,7 @@ {"maverick", "maverick", nullptr, nullptr}, {"xscale", "noxscale", nullptr, nullptr}, {"sb", "nosb", "+sb", "-sb"}, + {"i8mm", "noi8mm", "+i8mm", "-i8mm"}, {"mve", "nomve", "+mve", "-mve"}, {"mve.fp", "nomve.fp", "+mve.fp", "-mve.fp"}}; @@ -1230,7 +1231,10 @@ {"tme", "notme", "+tme", "-tme"}, {"ssbs", "nossbs", "+ssbs", "-ssbs"}, {"sb", "nosb", "+sb", "-sb"}, - {"predres", "nopredres", "+predres", "-predres"} + {"predres", "nopredres", "+predres", "-predres"}, + {"i8mm", "noi8mm", "+i8mm", "-i8mm"}, + {"f32mm", "nof32mm", "+f32mm", "-f32mm"}, + {"f64mm", "nof64mm", "+f64mm", "-f64mm"}, }; for (unsigned i = 0; i < array_lengthof(ArchExt); i++) {
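
For reference, a minimal C-level sketch of how the new i8mm NEON operations exercised by the tests above are expected to be used from source. This is illustrative only: the intrinsic spellings (vusdot_s32, vusdot_lane_s32, vmmlaq_s32, vusmmlaq_s32) are the ACLE names the arm_neon.td additions should generate, and the exact signatures ought to be confirmed against the generated arm_neon.h; a plausible compiler invocation is clang -O2 -march=armv8.6-a, or an earlier base architecture with +i8mm appended.

#include <arm_neon.h>

#ifdef __ARM_FEATURE_MATMUL_INT8
/* Mixed-sign dot product: each 32-bit lane of the accumulator gains the dot
   product of four unsigned bytes from a with four signed bytes from b. */
int32x2_t dot_us(int32x2_t acc, uint8x8_t a, int8x8_t b) {
  return vusdot_s32(acc, a, b);
}

/* Indexed form: the group of four signed bytes selected from b by the
   constant lane index is reused for every lane of the accumulator. */
int32x2_t dot_us_lane0(int32x2_t acc, uint8x8_t a, int8x8_t b) {
  return vusdot_lane_s32(acc, a, b, 0);
}

/* Signed 2x8 by 8x2 matrix multiply-accumulate; the 2x2 int32 result is
   held in one 128-bit vector. */
int32x4_t mmla_ss(int32x4_t acc, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(acc, a, b);
}

/* Unsigned-by-signed variant of the matrix multiply-accumulate. */
int32x4_t mmla_us(int32x4_t acc, uint8x16_t a, int8x16_t b) {
  return vusmmlaq_s32(acc, a, b);
}
#endif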