diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -62,6 +62,9 @@
 BUILTIN(__builtin_arm_stg, "vv*", "t")
 BUILTIN(__builtin_arm_subp, "Uiv*v*", "t")
 
+// Memory Operations
+BUILTIN(__builtin_arm_mops_memset_tag, "v*v*iz", "")
+
 // Memory barrier
 BUILTIN(__builtin_arm_dmb, "vUi", "nc")
 BUILTIN(__builtin_arm_dsb, "vUi", "nc")
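For reference, the "v*v*iz" signature string above decodes to the prototype below (a sketch based on Clang's builtin signature letters; the parameter names are illustrative):

/* "v*v*iz": return void* ("v*"); parameters void* ("v*"), int ("i"), size_t ("z"). */
void *__builtin_arm_mops_memset_tag(void *tagged_address, int value, size_t size);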
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9765,6 +9765,18 @@
     return Builder.CreateCall(F, {Arg0, Arg1});
   }
 
+  // Memory Operations (MOPS)
+  if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
+    Value *Dst = EmitScalarExpr(E->getArg(0));
+    Value *Val = EmitScalarExpr(E->getArg(1));
+    Value *Size = EmitScalarExpr(E->getArg(2));
+    Dst = Builder.CreatePointerCast(Dst, Int8PtrTy);
+    Val = Builder.CreateTrunc(Val, Int8Ty);
+    Size = Builder.CreateIntCast(Size, Int64Ty, false);
+    return Builder.CreateCall(
+        CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size});
+  }
+
   // Memory Tagging Extensions (MTE) Intrinsics
   Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
   switch (BuiltinID) {
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -730,6 +730,12 @@
 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
 #endif
 
+/* Memory Operations Intrinsics */
+#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
+#define __arm_mops_memset_tag(tagged_address, value, size)                     \
+  __builtin_arm_mops_memset_tag(tagged_address, value, size)
+#endif
+
 /* Transactional Memory Extension (TME) Intrinsics */
 #if __ARM_FEATURE_TME
 
diff --git a/clang/test/CodeGen/aarch64-mops.c b/clang/test/CodeGen/aarch64-mops.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-mops.c
@@ -0,0 +1,152 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-arm-unknown-eabi -target-feature +mops -target-feature +mte -O0 -S -emit-llvm -o - %s  | FileCheck %s
+
+#define __ARM_FEATURE_MOPS 1
+#include <arm_acle.h>
+#include <stddef.h>
+
+// CHECK-LABEL: @bzero_0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 0, i64 0)
+// CHECK-NEXT:    ret i8* [[TMP1]]
+//
+void *bzero_0(void *dst) {
+  return __arm_mops_memset_tag(dst, 0, 0);
+}
+
+// CHECK-LABEL: @bzero_1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 0, i64 1)
+// CHECK-NEXT:    ret i8* [[TMP1]]
+//
+void *bzero_1(void *dst) {
+  return __arm_mops_memset_tag(dst, 0, 1);
+}
+
+// CHECK-LABEL: @bzero_10(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 0, i64 10)
+// CHECK-NEXT:    ret i8* [[TMP1]]
+//
+void *bzero_10(void *dst) {
+  return __arm_mops_memset_tag(dst, 0, 10);
+}
+
+// CHECK-LABEL: @bzero_10000(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 0, i64 10000)
+// CHECK-NEXT:    ret i8* [[TMP1]]
+//
+void *bzero_10000(void *dst) {
+  return __arm_mops_memset_tag(dst, 0, 10000);
+}
+
+// CHECK-LABEL: @bzero_n(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[SIZE_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    store i64 [[SIZE:%.*]], i64* [[SIZE_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[SIZE_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 0, i64 [[TMP1]])
+// CHECK-NEXT:    ret i8* [[TMP2]]
+//
+void *bzero_n(void *dst, size_t size) {
+  return __arm_mops_memset_tag(dst, 0, size);
+}
+
+// CHECK-LABEL: @memset_0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VALUE_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[VALUE:%.*]], i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 [[TMP2]], i64 0)
+// CHECK-NEXT:    ret i8* [[TMP3]]
+//
+void *memset_0(void *dst, int value) {
+  return __arm_mops_memset_tag(dst, value, 0);
+}
+
+// CHECK-LABEL: @memset_1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VALUE_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[VALUE:%.*]], i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 [[TMP2]], i64 1)
+// CHECK-NEXT:    ret i8* [[TMP3]]
+//
+void *memset_1(void *dst, int value) {
+  return __arm_mops_memset_tag(dst, value, 1);
+}
+
+// CHECK-LABEL: @memset_10(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VALUE_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[VALUE:%.*]], i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 [[TMP2]], i64 10)
+// CHECK-NEXT:    ret i8* [[TMP3]]
+//
+void *memset_10(void *dst, int value) {
+  return __arm_mops_memset_tag(dst, value, 10);
+}
+
+// CHECK-LABEL: @memset_10000(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VALUE_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[VALUE:%.*]], i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 [[TMP2]], i64 10000)
+// CHECK-NEXT:    ret i8* [[TMP3]]
+//
+void *memset_10000(void *dst, int value) {
+  return __arm_mops_memset_tag(dst, value, 10000);
+}
+
+// CHECK-LABEL: @memset_n(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca i8*, align 8
+// CHECK-NEXT:    [[VALUE_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SIZE_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store i8* [[DST:%.*]], i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[VALUE:%.*]], i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    store i64 [[SIZE:%.*]], i64* [[SIZE_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[DST_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[VALUE_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[SIZE_ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP1]] to i8
+// CHECK-NEXT:    [[TMP4:%.*]] = call i8* @llvm.aarch64.mops.memset.tag(i8* [[TMP0]], i8 [[TMP3]], i64 [[TMP2]])
+// CHECK-NEXT:    ret i8* [[TMP4]]
+//
+void *memset_n(void *dst, int value, size_t size) {
+  return __arm_mops_memset_tag(dst, value, size);
+}
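For context, a minimal usage sketch of the new ACLE intrinsic (not part of the patch): it assumes a target with both FEAT_MOPS and FEAT_MTE, a 16-byte-aligned buffer whose size is a multiple of the MTE granule, and it combines the existing __arm_mte_create_random_tag intrinsic from arm_acle.h with the new __arm_mops_memset_tag; the helper name zero_and_retag is hypothetical.

#include <arm_acle.h>
#include <stddef.h>

/* Zero-fill a 16-byte-aligned buffer and give it a fresh allocation tag in a
 * single pass. The returned pointer carries the new logical tag and must be
 * used for all subsequent accesses to the buffer. */
void *zero_and_retag(void *buf, size_t size) {
  /* Derive a pointer with a random logical tag (existing MTE intrinsic). */
  void *tagged = __arm_mte_create_random_tag(buf, 0);
  /* SETG*-backed memset: writes the bytes and updates the allocation tags. */
  return __arm_mops_memset_tag(tagged, 0, size);
}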
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -432,16 +432,6 @@
     return TypeIdx;
   }
 
-  unsigned immIdx(unsigned ImmIdx) {
-    assert(ImmIdx <= (MCOI::OPERAND_LAST_GENERIC_IMM -
-                      MCOI::OPERAND_FIRST_GENERIC_IMM) &&
-           "Imm Index is out of bounds");
-#ifndef NDEBUG
-    ImmIdxsCovered.set(ImmIdx);
-#endif
-    return ImmIdx;
-  }
-
   void markAllIdxsAsCovered() {
 #ifndef NDEBUG
     TypeIdxsCovered.set();
@@ -568,6 +558,16 @@
   }
   unsigned getAlias() const { return AliasOf; }
 
+  unsigned immIdx(unsigned ImmIdx) {
+    assert(ImmIdx <= (MCOI::OPERAND_LAST_GENERIC_IMM -
+                      MCOI::OPERAND_FIRST_GENERIC_IMM) &&
+           "Imm Index is out of bounds");
+#ifndef NDEBUG
+    ImmIdxsCovered.set(ImmIdx);
+#endif
+    return ImmIdx;
+  }
+
   /// The instruction is legal if predicate is true.
   LegalizeRuleSet &legalIf(LegalityPredicate Predicate) {
     // We have no choice but conservatively assume that the free-form
@@ -824,11 +824,22 @@
   LegalizeRuleSet &customForCartesianProduct(std::initializer_list<LLT> Types) {
     return actionForCartesianProduct(LegalizeAction::Custom, Types);
   }
+  /// The instruction is custom when type indexes 0 and 1 are both in their
+  /// respective lists.
   LegalizeRuleSet &
   customForCartesianProduct(std::initializer_list<LLT> Types0,
                             std::initializer_list<LLT> Types1) {
     return actionForCartesianProduct(LegalizeAction::Custom, Types0, Types1);
   }
+  /// The instruction is custom when type indexes 0, 1, and 2 are all in
+  /// their respective lists.
+  LegalizeRuleSet &
+  customForCartesianProduct(std::initializer_list<LLT> Types0,
+                            std::initializer_list<LLT> Types1,
+                            std::initializer_list<LLT> Types2) {
+    return actionForCartesianProduct(LegalizeAction::Custom, Types0, Types1,
+                                     Types2);
+  }
 
   /// Unconditionally custom lower.
   LegalizeRuleSet &custom() {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -890,6 +890,14 @@
     [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>]>;
 }
 
+//===----------------------------------------------------------------------===//
+// Memory Operations (MOPS) Intrinsics
+let TargetPrefix = "aarch64" in {
+  // Sizes are chosen to correspond to the llvm.memset intrinsic: ptr, i8, i64
+  def int_aarch64_mops_memset_tag : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty, llvm_i64_ty],
+      [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>]>;
+}
+
 // Transactional Memory Extension (TME) Intrinsics
 let TargetPrefix = "aarch64" in {
 def int_aarch64_tstart  : GCCBuiltin<"__builtin_arm_tstart">,
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugLoc.h"
@@ -88,6 +89,8 @@
                            MachineBasicBlock::iterator MBBI);
   bool expandStoreSwiftAsyncContext(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI);
+  bool expandMOPS(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  unsigned Opc);
 };
 
 } // end anonymous namespace
@@ -807,6 +810,51 @@
   return true;
 }
 
+bool AArch64ExpandPseudo::expandMOPS(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned Pseudo) {
+  auto &STI = MBB.getParent()->getSubtarget<AArch64Subtarget>();
+  assert(STI.hasMOPS());
+  assert(STI.hasMTE() || Pseudo != AArch64::MOPSMemorySetTagging);
+
+  const auto Ops = [Pseudo]() -> std::array<unsigned, 3> {
+    if (Pseudo == AArch64::MOPSMemoryCopy)
+      return {AArch64::CPYFP, AArch64::CPYFM, AArch64::CPYFE};
+    if (Pseudo == AArch64::MOPSMemoryMove)
+      return {AArch64::CPYP, AArch64::CPYM, AArch64::CPYE};
+    if (Pseudo == AArch64::MOPSMemorySet)
+      return {AArch64::SETP, AArch64::SETM, AArch64::SETE};
+    if (Pseudo == AArch64::MOPSMemorySetTagging)
+      return {AArch64::SETGP, AArch64::SETGM, AArch64::MOPSSETGE};
+    llvm_unreachable("Unhandled memory operation pseudo");
+  }();
+  const bool IsSet = Pseudo == AArch64::MOPSMemorySet ||
+                     Pseudo == AArch64::MOPSMemorySetTagging;
+
+  // MOPS requires consecutive instructions in its sequences, so pack them
+  // inside a bundle to prevent other passes from moving things in between.
+  MIBundleBuilder Bundler(MBB, MBBI);
+  auto &MF = *MBB.getParent();
+  for (auto Op : Ops) {
+    auto B = BuildMI(MF, MBBI->getDebugLoc(), TII->get(Op));
+    int i = 0;
+    // Destination registers
+    B.addDef(MBBI->getOperand(i++).getReg());
+    B.addDef(MBBI->getOperand(i++).getReg());
+    if (!IsSet)
+      B.addDef(MBBI->getOperand(i++).getReg());
+    // Input registers
+    B.addUse(MBBI->getOperand(i++).getReg());
+    B.addUse(MBBI->getOperand(i++).getReg());
+    B.addUse(MBBI->getOperand(i++).getReg());
+    Bundler.append(B);
+  }
+  finalizeBundle(MBB, Bundler.begin(), Bundler.end());
+
+  MBBI->eraseFromParent();
+  return true;
+}
+
 /// If MBBI references a pseudo instruction that should be expanded here,
 /// do the expansion and return true.  Otherwise return false.
 bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -1231,6 +1279,11 @@
      return expandCALL_RVMARKER(MBB, MBBI);
    case AArch64::StoreSwiftAsyncContext:
      return expandStoreSwiftAsyncContext(MBB, MBBI);
+   case AArch64::MOPSMemoryCopy:
+   case AArch64::MOPSMemoryMove:
+   case AArch64::MOPSMemorySet:
+   case AArch64::MOPSMemorySetTagging:
+     return expandMOPS(MBB, MBBI, Opcode);
   }
   return false;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -450,6 +450,12 @@
   LDP,
   STP,
   STNP,
+
+  // Memory Operations
+  MOPS_MEMSET,
+  MOPS_MEMSET_TAGGING,
+  MOPS_MEMCOPY,
+  MOPS_MEMMOVE,
 };
 
 } // end namespace AArch64ISD
@@ -887,6 +893,7 @@
 
   SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -936,17 +936,28 @@
 
   setTargetDAGCombine(ISD::GlobalAddress);
 
-  // In case of strict alignment, avoid an excessive number of byte wide stores.
-  MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemset = Subtarget->requiresStrictAlign()
-                       ? MaxStoresPerMemsetOptSize : 32;
+  if (Subtarget->hasMOPS()) {
+    // If we have MOPS, always use them
+    MaxStoresPerMemsetOptSize = 0;
+    MaxStoresPerMemset = 0;
+    MaxGluedStoresPerMemcpy = 0;
+    MaxStoresPerMemcpyOptSize = 0;
+    MaxStoresPerMemcpy = 0;
+    MaxStoresPerMemmoveOptSize = 0;
+    MaxStoresPerMemmove = 0;
+  } else {
+    // In case of strict alignment, avoid an excessive number of byte wide stores.
+    MaxStoresPerMemsetOptSize = 8;
+    MaxStoresPerMemset = Subtarget->requiresStrictAlign()
+                        ? MaxStoresPerMemsetOptSize : 32;
 
-  MaxGluedStoresPerMemcpy = 4;
-  MaxStoresPerMemcpyOptSize = 4;
-  MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
-                       ? MaxStoresPerMemcpyOptSize : 16;
+    MaxGluedStoresPerMemcpy = 4;
+    MaxStoresPerMemcpyOptSize = 4;
+    MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
+                        ? MaxStoresPerMemcpyOptSize : 16;
 
-  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
+    MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
+  }
 
   MaxLoadsPerMemcmpOptSize = 4;
   MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
@@ -1423,6 +1434,11 @@
     setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
   }
 
+  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
+    // Only required for llvm.aarch64.mops.memset.tag
+    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+  }
+
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
@@ -2240,6 +2256,10 @@
     MAKE_CASE(AArch64ISD::UADDLP)
     MAKE_CASE(AArch64ISD::CALL_RVMARKER)
     MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
+    MAKE_CASE(AArch64ISD::MOPS_MEMSET)
+    MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
+    MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
+    MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
   }
 #undef MAKE_CASE
   return nullptr;
@@ -4028,6 +4048,39 @@
   return DAG.getNode(ISD::AND, DL, OutVT, Reinterpret, MaskReinterpret);
 }
 
+SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  unsigned IntNo = Op.getConstantOperandVal(1);
+  switch (IntNo) {
+  default:
+    return SDValue(); // Don't custom lower most intrinsics.
+  case Intrinsic::aarch64_mops_memset_tag: {
+    auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
+    SDLoc DL(Op);
+    SDValue Chain = Node->getChain();
+    SDValue Dst = Op.getOperand(2);
+    SDValue Val = Op.getOperand(3);
+    Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
+    SDValue Size = Op.getOperand(4);
+    auto Alignment = Node->getMemOperand()->getAlign();
+    bool IsVol = Node->isVolatile();
+    auto DstPtrInfo = Node->getPointerInfo();
+
+    const auto &SDI =
+        static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
+    SDValue MS =
+        SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
+                     Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
+
+    // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
+    // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
+    // LowerOperationWrapper will complain that the number of results has
+    // changed.
+    return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
+  }
+  }
+}
+
 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -5095,6 +5148,8 @@
   case ISD::MULHU:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED,
                                /*OverrideNEON=*/true);
+  case ISD::INTRINSIC_W_CHAIN:
+    return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::ATOMIC_STORE:
@@ -11812,6 +11867,18 @@
     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
     return true;
   }
+  case Intrinsic::aarch64_mops_memset_tag: {
+    Value *Dst = I.getArgOperand(0);
+    Value *Val = I.getArgOperand(1);
+    PointerType *PtrTy = cast<PointerType>(Dst->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(Val->getType());
+    Info.ptrVal = Dst;
+    Info.offset = 0;
+    Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+    Info.flags = MachineMemOperand::MOStore;
+    return true;
+  }
   default:
     break;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8337,6 +8337,35 @@
   defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">;
 }
 
+// MOPS Node operands: 0: Dst, 1: Src or Value, 2: Size, 3: Chain
+// MOPS Node results: 0: Dst writeback, 1: Size writeback, 2: Chain
+def SDT_AArch64mops : SDTypeProfile<2, 3, [ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2> ]>;
+def AArch64mops_memset : SDNode<"AArch64ISD::MOPS_MEMSET", SDT_AArch64mops>;
+def AArch64mops_memset_tagging : SDNode<"AArch64ISD::MOPS_MEMSET_TAGGING", SDT_AArch64mops>;
+def AArch64mops_memcopy : SDNode<"AArch64ISD::MOPS_MEMCOPY", SDT_AArch64mops>;
+def AArch64mops_memmove : SDNode<"AArch64ISD::MOPS_MEMMOVE", SDT_AArch64mops>;
+
+let Predicates = [HasMOPS], mayStore = 1 in {
+  let mayLoad = 1 in {
+    def MOPSMemoryCopy : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
+                                (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
+                                [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>;
+    def MOPSMemoryMove : Pseudo<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
+                                (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
+                                [], "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb">, Sched<[]>;
+  }
+  let mayLoad = 0 in {
+    def MOPSMemorySet  : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
+                                (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
+                                [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>;
+  }
+}
+let Predicates = [HasMOPS, HasMTE], mayLoad = 0, mayStore = 1 in {
+  def MOPSMemorySetTagging : Pseudo<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
+                                    (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
+                                    [], "$Rd = $Rd_wb,$Rn = $Rn_wb">, Sched<[]>;
+}
+
 let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in
 def StoreSwiftAsyncContext
       : Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset),
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -19,11 +19,30 @@
 
 class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
 public:
+  SDValue EmitMOPS(AArch64ISD::NodeType SDOpcode, SelectionDAG &DAG,
+                   const SDLoc &DL, SDValue Chain, SDValue Dst,
+                   SDValue SrcOrValue, SDValue Size, Align Alignment,
+                   bool isVolatile, MachinePointerInfo DstPtrInfo,
+                   MachinePointerInfo SrcPtrInfo) const;
+
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                                  SDValue Chain, SDValue Dst, SDValue Src,
+                                  SDValue Size, Align Alignment,
+                                  bool isVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const override;
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
                                   bool isVolatile,
                                   MachinePointerInfo DstPtrInfo) const override;
+  SDValue
+  EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+                           SDValue Dst, SDValue Src, SDValue Size,
+                           Align Alignment, bool isVolatile,
+                           MachinePointerInfo DstPtrInfo,
+                           MachinePointerInfo SrcPtrInfo) const override;
+
   SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   MachinePointerInfo DstPtrInfo,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -15,15 +15,102 @@
 
 #define DEBUG_TYPE "aarch64-selectiondag-info"
 
+SDValue AArch64SelectionDAGInfo::EmitMOPS(AArch64ISD::NodeType SDOpcode,
+                                          SelectionDAG &DAG, const SDLoc &DL,
+                                          SDValue Chain, SDValue Dst,
+                                          SDValue SrcOrValue, SDValue Size,
+                                          Align Alignment, bool isVolatile,
+                                          MachinePointerInfo DstPtrInfo,
+                                          MachinePointerInfo SrcPtrInfo) const {
+
+  // Get the constant size of the copy/set, if known; it is used below to
+  // size the MachineMemOperands.
+  uint64_t ConstSize = 0;
+  if (auto *C = dyn_cast<ConstantSDNode>(Size))
+    ConstSize = C->getZExtValue();
+
+  const bool IsSet = SDOpcode == AArch64ISD::MOPS_MEMSET ||
+                     SDOpcode == AArch64ISD::MOPS_MEMSET_TAGGING;
+
+  const auto MachineOpcode = [&]() {
+    switch (SDOpcode) {
+    case AArch64ISD::MOPS_MEMSET:
+      return AArch64::MOPSMemorySet;
+    case AArch64ISD::MOPS_MEMSET_TAGGING:
+      return AArch64::MOPSMemorySetTagging;
+    case AArch64ISD::MOPS_MEMCOPY:
+      return AArch64::MOPSMemoryCopy;
+    case AArch64ISD::MOPS_MEMMOVE:
+      return AArch64::MOPSMemoryMove;
+    default:
+      break;
+    }
+    llvm_unreachable("Unhandled MOPS ISD Opcode");
+    return AArch64::INSTRUCTION_LIST_END;
+  }();
+
+  MachineMemOperand::Flags Flags = MachineMemOperand::MOStore;
+  if (isVolatile)
+    Flags |= MachineMemOperand::MOVolatile;
+  if (!IsSet)
+    Flags |= MachineMemOperand::MOLoad;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  auto *DstOp =
+      MF.getMachineMemOperand(DstPtrInfo, Flags, ConstSize, Alignment);
+  auto *SrcOp =
+      MF.getMachineMemOperand(SrcPtrInfo, Flags, ConstSize, Alignment);
+
+  // Extend i8 value to i64 if required
+  if (SrcOrValue.getValueType().getSimpleVT() == MVT::i8) {
+    SrcOrValue = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, SrcOrValue);
+  }
+
+  if (IsSet) {
+    SDValue Ops[] = {Dst, Size, SrcOrValue, Chain};
+    const EVT ResultTys[] = {MVT::i64, MVT::i64, MVT::Other};
+    MachineSDNode *Node = DAG.getMachineNode(MachineOpcode, DL, ResultTys, Ops);
+    DAG.setNodeMemRefs(Node, {DstOp});
+    return SDValue(Node, 2);
+  } else {
+    SDValue Ops[] = {Dst, SrcOrValue, Size, Chain};
+    const EVT ResultTys[] = {MVT::i64, MVT::i64, MVT::i64, MVT::Other};
+    MachineSDNode *Node = DAG.getMachineNode(MachineOpcode, DL, ResultTys, Ops);
+    DAG.setNodeMemRefs(Node, {DstOp, SrcOp});
+    return SDValue(Node, 3);
+  }
+}
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemcpy(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+  if (STI.hasMOPS())
+    return EmitMOPS(AArch64ISD::MOPS_MEMCOPY, DAG, DL, Chain, Dst, Src, Size,
+                    Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+  return SDValue();
+}
+
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, Align Alignment, bool isVolatile,
     MachinePointerInfo DstPtrInfo) const {
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+
+  if (STI.hasMOPS()) {
+    return EmitMOPS(AArch64ISD::MOPS_MEMSET, DAG, dl, Chain, Dst, Src, Size,
+                    Alignment, isVolatile, DstPtrInfo, MachinePointerInfo{});
+  }
+
   // Check to see if there is a specialized entry-point for memory zeroing.
   ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
   ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
-  const AArch64Subtarget &STI =
-      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
   const char *bzeroName =
       (V && V->isZero())
           ? DAG.getTargetLoweringInfo().getLibcallName(RTLIB::BZERO)
@@ -55,6 +142,19 @@
   return SDValue();
 }
 
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemmove(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, Align Alignment, bool isVolatile,
+    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
+  const AArch64Subtarget &STI =
+      DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
+  if (STI.hasMOPS()) {
+    return EmitMOPS(AArch64ISD::MOPS_MEMMOVE, DAG, dl, Chain, Dst, Src, Size,
+                    Alignment, isVolatile, DstPtrInfo, SrcPtrInfo);
+  }
+  return SDValue();
+}
+
 static const int kSetTagLoopThreshold = 176;
 
 static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -193,6 +193,7 @@
   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
+  bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
 
   unsigned emitConstantPoolEntry(const Constant *CPVal,
@@ -3425,6 +3426,13 @@
   case TargetOpcode::G_VECREDUCE_FADD:
   case TargetOpcode::G_VECREDUCE_ADD:
     return selectReduction(I, MRI);
+  case TargetOpcode::G_BZERO:
+  case TargetOpcode::G_MEMCPY:
+  case TargetOpcode::G_MEMCPY_INLINE:
+  case TargetOpcode::G_MEMMOVE:
+  case TargetOpcode::G_MEMSET:
+    assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
+    return selectMOPS(I, MRI);
   }
 
   return false;
@@ -3482,6 +3490,68 @@
   return false;
 }
 
+bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
+                                            MachineRegisterInfo &MRI) {
+  assert(GI.getOpcode() != TargetOpcode::G_BZERO &&
+         "There is no point combining to G_BZERO only to re-materialize the "
+         "zero.");
+
+  unsigned Mopcode;
+  switch (GI.getOpcode()) {
+  case TargetOpcode::G_MEMCPY:
+  case TargetOpcode::G_MEMCPY_INLINE:
+    Mopcode = AArch64::MOPSMemoryCopy;
+    break;
+  case TargetOpcode::G_MEMMOVE:
+    Mopcode = AArch64::MOPSMemoryMove;
+    break;
+  case TargetOpcode::G_MEMSET:
+    // For tagged memset see llvm.aarch64.mops.memset.tag
+    Mopcode = AArch64::MOPSMemorySet;
+    break;
+  }
+
+  auto &DstPtr = GI.getOperand(0);
+  auto &SrcOrVal = GI.getOperand(1);
+  auto &Size = GI.getOperand(2);
+
+  // Create copies of the registers that can be clobbered.
+  const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
+  const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
+  const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
+
+  const bool IsSet = Mopcode == AArch64::MOPSMemorySet;
+  const auto &SrcValRegClass =
+      IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
+
+  // Constrain to specific registers
+  RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
+  RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
+  RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
+
+  MIB.buildCopy(DstPtrCopy, DstPtr);
+  MIB.buildCopy(SrcValCopy, SrcOrVal);
+  MIB.buildCopy(SizeCopy, Size);
+
+  // The new instruction uses the copied registers because it must update them.
+  // The defs are unused (G_MEM* has no corresponding results), but they are
+  // still tied to the uses.
+  // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
+  Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
+  Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  if (IsSet) {
+    MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
+                   {DstPtrCopy, SizeCopy, SrcValCopy});
+  } else {
+    Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
+    MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
+                   {DstPtrCopy, SrcValCopy, SizeCopy});
+  }
+
+  GI.eraseFromParent();
+  return true;
+}
+
 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
                                             MachineRegisterInfo &MRI) {
   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
@@ -5376,6 +5446,36 @@
     constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
     break;
   }
+  case Intrinsic::aarch64_mops_memset_tag: {
+    // Transform
+    //    %dst:gpr(p0) = \
+    //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
+    //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
+    // where %dst is updated, into
+    //      %Rd:GPR64common, %Rn:GPR64 = \
+    //      MOPSMemorySetTagging \
+    //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
+    // where Rd and Rn are tied.
+    // It is expected that %val has been extended to s64 in legalization.
+    // Note that the order of the size and value operands is swapped.
+
+    Register DstDef = I.getOperand(0).getReg();
+    // I.getOperand(1) is the intrinsic function
+    Register DstUse = I.getOperand(2).getReg();
+    Register ValUse = I.getOperand(3).getReg();
+    Register SizeUse = I.getOperand(4).getReg();
+
+    // MOPSMemorySetTagging has two defs; the intrinsic call has only one.
+    // Therefore an additional virtual register is required for the updated size
+    // operand. This value is not accessible via the semantics of the intrinsic.
+    Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));
+
+    auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTagging,
+                                 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
+    Memset.cloneMemRefs(I);
+    constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
+    break;
+  }
   }
 
   I.eraseFromParent();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -56,6 +56,7 @@
   bool legalizeAtomicCmpxchg128(MachineInstr &MI, MachineRegisterInfo &MRI,
                                 LegalizerHelper &Helper) const;
   bool legalizeCTTZ(MachineInstr &MI, LegalizerHelper &Helper) const;
+  bool legalizeMemOps(MachineInstr &MI, LegalizerHelper &Helper) const;
   const AArch64Subtarget *ST;
 };
 } // End llvm namespace.
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -699,8 +699,28 @@
 
   getActionDefinitionsBuilder(G_DYN_STACKALLOC).lower();
 
-  getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
-      .libcall();
+  if (ST.hasMOPS()) {
+    // G_BZERO is not supported. Currently it is only emitted by
+    // PreLegalizerCombiner for G_MEMSET with zero constant.
+    getActionDefinitionsBuilder(G_BZERO).unsupported();
+
+    getActionDefinitionsBuilder(G_MEMSET)
+        .legalForCartesianProduct({p0}, {s64}, {s64})
+        .customForCartesianProduct({p0}, {s8}, {s64})
+        .immIdx(0); // Inform verifier imm idx 0 is handled.
+
+    getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE})
+        .legalForCartesianProduct({p0}, {p0}, {s64})
+        .immIdx(0); // Inform verifier imm idx 0 is handled.
+
+    // G_MEMCPY_INLINE does not have a tailcall immediate
+    getActionDefinitionsBuilder(G_MEMCPY_INLINE)
+        .legalForCartesianProduct({p0}, {p0}, {s64});
+
+  } else {
+    getActionDefinitionsBuilder({G_BZERO, G_MEMCPY, G_MEMMOVE, G_MEMSET})
+        .libcall();
+  }
 
   // FIXME: Legal types are only legal with NEON.
   getActionDefinitionsBuilder(G_ABS)
@@ -832,6 +852,11 @@
     return legalizeAtomicCmpxchg128(MI, MRI, Helper);
   case TargetOpcode::G_CTTZ:
     return legalizeCTTZ(MI, Helper);
+  case TargetOpcode::G_BZERO:
+  case TargetOpcode::G_MEMCPY:
+  case TargetOpcode::G_MEMMOVE:
+  case TargetOpcode::G_MEMSET:
+    return legalizeMemOps(MI, Helper);
   }
 
   llvm_unreachable("expected switch to return");
@@ -989,6 +1014,15 @@
     MI.eraseFromParent();
     return true;
   }
+  case Intrinsic::aarch64_mops_memset_tag: {
+    assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+    // Widen the value to 64 bits; an any-extend is sufficient here.
+    MachineIRBuilder MIB(MI);
+    auto &Value = MI.getOperand(3);
+    Register ZExtValueReg = MIB.buildAnyExt(LLT::scalar(64), Value).getReg(0);
+    Value.setReg(ZExtValueReg);
+    return true;
+  }
   }
 
   return true;
@@ -1359,3 +1393,20 @@
   MI.eraseFromParent();
   return true;
 }
+
+bool AArch64LegalizerInfo::legalizeMemOps(MachineInstr &MI,
+                                          LegalizerHelper &Helper) const {
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+
+  // The tagged version, MOPSMemorySetTagging, is legalized in legalizeIntrinsic.
+  if (MI.getOpcode() == TargetOpcode::G_MEMSET) {
+    // Widen the value operand to 64 bits; an any-extend is sufficient here.
+    auto &Value = MI.getOperand(1);
+    Register ZExtValueReg =
+        MIRBuilder.buildAnyExt(LLT::scalar(64), Value).getReg(0);
+    Value.setReg(ZExtValueReg);
+    return true;
+  }
+
+  return false;
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp
@@ -75,6 +75,7 @@
                                                 MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
   AArch64GenO0PreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);
+  auto &ST = static_cast<const AArch64Subtarget &>(B.getMF().getSubtarget());
 
   if (Generated.tryCombineAll(Observer, MI, B))
     return true;
@@ -86,10 +87,15 @@
   case TargetOpcode::G_SHUFFLE_VECTOR:
     return Helper.tryCombineShuffleVector(MI);
   case TargetOpcode::G_MEMCPY_INLINE:
+    if (ST.hasMOPS())
+      return false;
     return Helper.tryEmitMemcpyInline(MI);
   case TargetOpcode::G_MEMCPY:
   case TargetOpcode::G_MEMMOVE:
   case TargetOpcode::G_MEMSET: {
+    if (ST.hasMOPS())
+      return false;
+
     // At -O0 set a maxlen of 32 to inline;
     unsigned MaxLen = 32;
     // Try to inline memcpy type calls if optimizations are enabled.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll b/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops-consecutive.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O2 -mattr=+mops       | FileCheck %s --check-prefix=CHECK-MOPS
+
+; Function Attrs: argmemonly mustprogress nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
+
+declare dso_local void @fn(i8*, i8*) local_unnamed_addr
+
+define hidden void @consecutive() local_unnamed_addr {
+; CHECK-MOPS-LABEL: consecutive:
+; CHECK-MOPS:       // %bb.0: // %entry
+; CHECK-MOPS-NEXT:    sub sp, sp, #80
+; CHECK-MOPS-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-MOPS-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-MOPS-NEXT:    .cfi_offset w30, -16
+; CHECK-MOPS-NEXT:    adrp x10, .LCPI0_0
+; CHECK-MOPS-NEXT:    adrp x11, .LCPI0_1
+; CHECK-MOPS-NEXT:    mov w8, #31
+; CHECK-MOPS-NEXT:    mov x9, sp
+; CHECK-MOPS-NEXT:    mov w12, #6424
+; CHECK-MOPS-NEXT:    setp [x9]!, x8!, xzr
+; CHECK-MOPS-NEXT:    setm [x9]!, x8!, xzr
+; CHECK-MOPS-NEXT:    sete [x9]!, x8!, xzr
+; CHECK-MOPS-NEXT:    movk w12, #6938, lsl #16
+; CHECK-MOPS-NEXT:    mov w13, #7452
+; CHECK-MOPS-NEXT:    ldr q0, [x10, :lo12:.LCPI0_0]
+; CHECK-MOPS-NEXT:    mov w8, #30
+; CHECK-MOPS-NEXT:    ldr d1, [x11, :lo12:.LCPI0_1]
+; CHECK-MOPS-NEXT:    add x0, sp, #32
+; CHECK-MOPS-NEXT:    mov x1, sp
+; CHECK-MOPS-NEXT:    str w12, [sp, #56]
+; CHECK-MOPS-NEXT:    strh w13, [sp, #60]
+; CHECK-MOPS-NEXT:    str q0, [sp, #32]
+; CHECK-MOPS-NEXT:    str d1, [sp, #48]
+; CHECK-MOPS-NEXT:    strb w8, [sp, #62]
+; CHECK-MOPS-NEXT:    bl fn
+; CHECK-MOPS-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-MOPS-NEXT:    add sp, sp, #80
+; CHECK-MOPS-NEXT:    ret
+entry:
+  %buf_from = alloca [31 x i8], align 16
+  %buf_to = alloca [31 x i8], align 1
+  %0 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 0
+  %1 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_to, i64 0, i64 0
+  call void @llvm.memset.p0i8.i64(i8* noundef nonnull align 1 dereferenceable(31) %1, i8 0, i64 31, i1 false)
+  %2 = bitcast [31 x i8]* %buf_from to <16 x i8>*
+  store <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8>* %2, align 16
+  %arrayidx.16 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 16
+  %3 = bitcast i8* %arrayidx.16 to <8 x i8>*
+  store <8 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, <8 x i8>* %3, align 16
+  %arrayidx.24 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 24
+  store i8 24, i8* %arrayidx.24, align 8
+  %arrayidx.25 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 25
+  store i8 25, i8* %arrayidx.25, align 1
+  %arrayidx.26 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 26
+  store i8 26, i8* %arrayidx.26, align 2
+  %arrayidx.27 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 27
+  store i8 27, i8* %arrayidx.27, align 1
+  %arrayidx.28 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 28
+  store i8 28, i8* %arrayidx.28, align 4
+  %arrayidx.29 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 29
+  store i8 29, i8* %arrayidx.29, align 1
+  %arrayidx.30 = getelementptr inbounds [31 x i8], [31 x i8]* %buf_from, i64 0, i64 30
+  store i8 30, i8* %arrayidx.30, align 2
+  call void @fn(i8* nonnull %0, i8* nonnull %1)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops-mte.ll b/llvm/test/CodeGen/AArch64/aarch64-mops-mte.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops-mte.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O2 -mattr=+mops,+mte  | FileCheck %s --check-prefix=SDAG
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O0 -global-isel=1 -global-isel-abort=1 -mattr=+mops,+mte  | FileCheck %s --check-prefix=GISel
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn writeonly
+declare i8* @llvm.aarch64.mops.memset.tag(i8*, i8, i64)
+
+define i8* @memset_tagged_0_zeroval(i8* %dst, i64 %size) {
+; SDAG-LABEL: memset_tagged_0_zeroval:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov x8, xzr
+; SDAG-NEXT:    setgp [x0]!, x8!, xzr
+; SDAG-NEXT:    setgm [x0]!, x8!, xzr
+; SDAG-NEXT:    setge [x0]!, x8!, xzr
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_0_zeroval:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    mov x8, xzr
+; GISel-NEXT:    setgp [x0]!, x8!, x8
+; GISel-NEXT:    setgm [x0]!, x8!, x8
+; GISel-NEXT:    setge [x0]!, x8!, x8
+; GISel-NEXT:    ret
+entry:
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 0, i64 0)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_1_zeroval(i8* %dst, i64 %size) {
+; SDAG-LABEL: memset_tagged_1_zeroval:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov w8, #1
+; SDAG-NEXT:    setgp [x0]!, x8!, xzr
+; SDAG-NEXT:    setgm [x0]!, x8!, xzr
+; SDAG-NEXT:    setge [x0]!, x8!, xzr
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_1_zeroval:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    mov x9, xzr
+; GISel-NEXT:    mov w8, #1
+; GISel-NEXT:    // kill: def $x8 killed $w8
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 0, i64 1)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_10_zeroval(i8* %dst, i64 %size) {
+; SDAG-LABEL: memset_tagged_10_zeroval:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov w8, #10
+; SDAG-NEXT:    setgp [x0]!, x8!, xzr
+; SDAG-NEXT:    setgm [x0]!, x8!, xzr
+; SDAG-NEXT:    setge [x0]!, x8!, xzr
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_10_zeroval:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    mov x9, xzr
+; GISel-NEXT:    mov w8, #10
+; GISel-NEXT:    // kill: def $x8 killed $w8
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 0, i64 10)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_10000_zeroval(i8* %dst, i64 %size) {
+; SDAG-LABEL: memset_tagged_10000_zeroval:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov w8, #10000
+; SDAG-NEXT:    setgp [x0]!, x8!, xzr
+; SDAG-NEXT:    setgm [x0]!, x8!, xzr
+; SDAG-NEXT:    setge [x0]!, x8!, xzr
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_10000_zeroval:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    mov x9, xzr
+; GISel-NEXT:    mov w8, #10000
+; GISel-NEXT:    // kill: def $x8 killed $w8
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 0, i64 10000)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_size_zeroval(i8* %dst, i64 %size) {
+; SDAG-LABEL: memset_tagged_size_zeroval:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    setgp [x0]!, x1!, xzr
+; SDAG-NEXT:    setgm [x0]!, x1!, xzr
+; SDAG-NEXT:    setge [x0]!, x1!, xzr
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_size_zeroval:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    mov x8, xzr
+; GISel-NEXT:    setgp [x0]!, x1!, x8
+; GISel-NEXT:    setgm [x0]!, x1!, x8
+; GISel-NEXT:    setge [x0]!, x1!, x8
+; GISel-NEXT:    ret
+entry:
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 0, i64 %size)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_0(i8* %dst, i64 %size, i32 %value) {
+; SDAG-LABEL: memset_tagged_0:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov x8, xzr
+; SDAG-NEXT:    // kill: def $w2 killed $w2 def $x2
+; SDAG-NEXT:    setgp [x0]!, x8!, x2
+; SDAG-NEXT:    setgm [x0]!, x8!, x2
+; SDAG-NEXT:    setge [x0]!, x8!, x2
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_0:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    // implicit-def: $x9
+; GISel-NEXT:    mov w9, w2
+; GISel-NEXT:    mov x8, xzr
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 %value_trunc, i64 0)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_1(i8* %dst, i64 %size, i32 %value) {
+; SDAG-LABEL: memset_tagged_1:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov w8, #1
+; SDAG-NEXT:    // kill: def $w2 killed $w2 def $x2
+; SDAG-NEXT:    setgp [x0]!, x8!, x2
+; SDAG-NEXT:    setgm [x0]!, x8!, x2
+; SDAG-NEXT:    setge [x0]!, x8!, x2
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_1:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    // implicit-def: $x9
+; GISel-NEXT:    mov w9, w2
+; GISel-NEXT:    mov w8, #1
+; GISel-NEXT:    // kill: def $x8 killed $w8
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 %value_trunc, i64 1)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_10(i8* %dst, i64 %size, i32 %value) {
+; SDAG-LABEL: memset_tagged_10:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov w8, #10
+; SDAG-NEXT:    // kill: def $w2 killed $w2 def $x2
+; SDAG-NEXT:    setgp [x0]!, x8!, x2
+; SDAG-NEXT:    setgm [x0]!, x8!, x2
+; SDAG-NEXT:    setge [x0]!, x8!, x2
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_10:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    // implicit-def: $x9
+; GISel-NEXT:    mov w9, w2
+; GISel-NEXT:    mov w8, #10
+; GISel-NEXT:    // kill: def $x8 killed $w8
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 %value_trunc, i64 10)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_10000(i8* %dst, i64 %size, i32 %value) {
+; SDAG-LABEL: memset_tagged_10000:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    mov w8, #10000
+; SDAG-NEXT:    // kill: def $w2 killed $w2 def $x2
+; SDAG-NEXT:    setgp [x0]!, x8!, x2
+; SDAG-NEXT:    setgm [x0]!, x8!, x2
+; SDAG-NEXT:    setge [x0]!, x8!, x2
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_10000:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    // implicit-def: $x9
+; GISel-NEXT:    mov w9, w2
+; GISel-NEXT:    mov w8, #10000
+; GISel-NEXT:    // kill: def $x8 killed $w8
+; GISel-NEXT:    setgp [x0]!, x8!, x9
+; GISel-NEXT:    setgm [x0]!, x8!, x9
+; GISel-NEXT:    setge [x0]!, x8!, x9
+; GISel-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 %value_trunc, i64 10000)
+  ret i8* %r
+}
+
+define i8* @memset_tagged_size(i8* %dst, i64 %size, i32 %value) {
+; SDAG-LABEL: memset_tagged_size:
+; SDAG:       // %bb.0: // %entry
+; SDAG-NEXT:    // kill: def $w2 killed $w2 def $x2
+; SDAG-NEXT:    setgp [x0]!, x1!, x2
+; SDAG-NEXT:    setgm [x0]!, x1!, x2
+; SDAG-NEXT:    setge [x0]!, x1!, x2
+; SDAG-NEXT:    ret
+;
+; GISel-LABEL: memset_tagged_size:
+; GISel:       // %bb.0: // %entry
+; GISel-NEXT:    // implicit-def: $x8
+; GISel-NEXT:    mov w8, w2
+; GISel-NEXT:    setgp [x0]!, x1!, x8
+; GISel-NEXT:    setgm [x0]!, x1!, x8
+; GISel-NEXT:    setge [x0]!, x1!, x8
+; GISel-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  %r = tail call i8* @llvm.aarch64.mops.memset.tag(i8* %dst, i8 %value_trunc, i64 %size)
+  ret i8* %r
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -0,0 +1,1373 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O2                    | FileCheck %s --check-prefix=O2-SDAG-WITHOUT-MOPS
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O2 -mattr=+mops       | FileCheck %s --check-prefix=O2-SDAG-MOPS
+
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O0 -global-isel=1 -global-isel-abort=1                    | FileCheck %s --check-prefix=O0-GISel-WITHOUT-MOPS
+; RUN: llc %s -o - -mtriple=aarch64-arm-none-eabi -O0 -global-isel=1 -global-isel-abort=1 -mattr=+mops       | FileCheck %s --check-prefix=O0-GISel-MOPS
+
+; Function Attrs: argmemonly nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg)
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg)
+
+; Function Attrs: argmemonly nofree nounwind willreturn
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg)
+
+
+define void @memset_0_zeroval(i8* %dst) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_0_zeroval:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_0_zeroval:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_0_zeroval:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_0_zeroval:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    mov x9, x8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 0, i1 false)
+  ret void
+}
+
+define void @memset_0_zeroval_volatile(i8* %dst) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_0_zeroval_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_0_zeroval_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_0_zeroval_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_0_zeroval_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    mov x9, x8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 0, i1 true)
+  ret void
+}
+
+define void @memset_10_zeroval(i8* %dst) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10_zeroval:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh wzr, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str xzr, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10_zeroval:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10_zeroval:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str xzr, [x0]
+; O0-GISel-WITHOUT-MOPS-NEXT:    strh wzr, [x0, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10_zeroval:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x9, xzr
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 10, i1 false)
+  ret void
+}
+
+define void @memset_10_zeroval_volatile(i8* %dst) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10_zeroval_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh wzr, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str xzr, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10_zeroval_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10_zeroval_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10_zeroval_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x9, xzr
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 10, i1 true)
+  ret void
+}
+
+define void @memset_10000_zeroval(i8* %dst) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10000_zeroval:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #10000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10000_zeroval:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10000
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10000_zeroval:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10000_zeroval:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x9, xzr
+; O0-GISel-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 10000, i1 false)
+  ret void
+}
+
+define void @memset_10000_zeroval_volatile(i8* %dst) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10000_zeroval_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #10000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10000_zeroval_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10000
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, xzr
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10000_zeroval_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10000_zeroval_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x9, xzr
+; O0-GISel-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 10000, i1 true)
+  ret void
+}
+
+define void @memset_size_zeroval(i8* %dst, i64 %size) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_size_zeroval:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x2, x1
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_size_zeroval:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x1!, xzr
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x1!, xzr
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x1!, xzr
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_size_zeroval:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov x2, x1
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_size_zeroval:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 %size, i1 false)
+  ret void
+}
+
+define void @memset_size_zeroval_volatile(i8* %dst, i64 %size) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_size_zeroval_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x2, x1
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_size_zeroval_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x1!, xzr
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x1!, xzr
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x1!, xzr
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_size_zeroval_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov x2, x1
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, wzr
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_size_zeroval_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 %size, i1 true)
+  ret void
+}
+
+
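+; memset where the fill value is passed in as a function argument.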
+define void @memset_0(i8* %dst, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_0:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_0:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_0:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_0:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    mov x9, x8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 0, i1 false)
+  ret void
+}
+
+define void @memset_0_volatile(i8* %dst, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_0_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_0_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_0_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_0_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    mov x9, x8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 0, i64 0, i1 true)
+  ret void
+}
+
+define void @memset_10(i8* %dst, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x8, #72340172838076673
+; O2-SDAG-WITHOUT-MOPS-NEXT:    and x9, x1, #0xff
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mul x8, x9, x8
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    // implicit-def: $x8
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, w1
+; O0-GISel-WITHOUT-MOPS-NEXT:    and x8, x8, #0xff
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov x9, #72340172838076673
+; O0-GISel-WITHOUT-MOPS-NEXT:    mul x8, x8, x9
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O0-GISel-WITHOUT-MOPS-NEXT:    // kill: def $w8 killed $w8 killed $x8
+; O0-GISel-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    // implicit-def: $x9
+; O0-GISel-MOPS-NEXT:    mov w9, w1
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %value_trunc, i64 10, i1 false)
+  ret void
+}
+
+define void @memset_10_volatile(i8* %dst, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x8, #72340172838076673
+; O2-SDAG-WITHOUT-MOPS-NEXT:    and x9, x1, #0xff
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mul x8, x9, x8
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    // implicit-def: $x9
+; O0-GISel-MOPS-NEXT:    mov w9, w1
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %value_trunc, i64 10, i1 true)
+  ret void
+}
+
+define void @memset_10000(i8* %dst, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10000:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #10000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10000:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10000
+; O2-SDAG-MOPS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10000:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10000:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    // implicit-def: $x9
+; O0-GISel-MOPS-NEXT:    mov w9, w1
+; O0-GISel-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %value_trunc, i64 10000, i1 false)
+  ret void
+}
+
+define void @memset_10000_volatile(i8* %dst, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_10000_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #10000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_10000_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10000
+; O2-SDAG-MOPS-NEXT:    // kill: def $w1 killed $w1 def $x1
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x8!, x1
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_10000_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_10000_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    // implicit-def: $x9
+; O0-GISel-MOPS-NEXT:    mov w9, w1
+; O0-GISel-MOPS-NEXT:    mov w8, #10000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x8!, x9
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %value_trunc, i64 10000, i1 true)
+  ret void
+}
+
+define void @memset_size(i8* %dst, i64 %size, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_size:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x8, x1
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w1, w2
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x2, x8
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_size:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    // kill: def $w2 killed $w2 def $x2
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x1!, x2
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x1!, x2
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x1!, x2
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_size:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    sub sp, sp, #32
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 32
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x1, [sp, #8] // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, w2
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    add sp, sp, #32
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_size:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    // implicit-def: $x8
+; O0-GISel-MOPS-NEXT:    mov w8, w2
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %value_trunc, i64 %size, i1 false)
+  ret void
+}
+
+define void @memset_size_volatile(i8* %dst, i64 %size, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memset_size_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x8, x1
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w1, w2
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov x2, x8
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memset
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memset_size_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    // kill: def $w2 killed $w2 def $x2
+; O2-SDAG-MOPS-NEXT:    setp [x0]!, x1!, x2
+; O2-SDAG-MOPS-NEXT:    setm [x0]!, x1!, x2
+; O2-SDAG-MOPS-NEXT:    sete [x0]!, x1!, x2
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memset_size_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    sub sp, sp, #32
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 32
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x1, [sp, #8] // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w1, w2
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x2, [sp, #8] // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memset
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    add sp, sp, #32
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memset_size_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    // implicit-def: $x8
+; O0-GISel-MOPS-NEXT:    mov w8, w2
+; O0-GISel-MOPS-NEXT:    setp [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    setm [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    sete [x0]!, x1!, x8
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  %value_trunc = trunc i32 %value to i8
+  call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %value_trunc, i64 %size, i1 true)
+  ret void
+}
+
+
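+; memcpy: with +mops the copy is done with the forward-only CPYF* instructions.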
+define void @memcpy_0(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_0:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_0:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_0:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_0:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 0, i1 false)
+  ret void
+}
+
+define void @memcpy_0_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_0_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_0_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_0_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_0_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 0, i1 true)
+  ret void
+}
+
+define void @memcpy_10(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_10:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x9, [x1]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x9, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_10:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_10:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x8, [x1]
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_10:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 10, i1 false)
+  ret void
+}
+
+define void @memcpy_10_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_10_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x8, [x1]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldrh w9, [x1, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w9, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_10_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_10_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memcpy
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_10_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 10, i1 true)
+  ret void
+}
+
+define void @memcpy_1000(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_1000:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #1000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memcpy
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_1000:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #1000
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_1000:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memcpy
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_1000:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 1000, i1 false)
+  ret void
+}
+
+define void @memcpy_1000_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_1000_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #1000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memcpy
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_1000_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #1000
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_1000_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memcpy
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_1000_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 1000, i1 true)
+  ret void
+}
+
+define void @memcpy_n(i8* %dst, i8* %src, i64 %size, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_n:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memcpy
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_n:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_n:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memcpy
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_n:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 %size, i1 false)
+  ret void
+}
+
+define void @memcpy_n_volatile(i8* %dst, i8* %src, i64 %size, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_n_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memcpy
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_n_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_n_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memcpy
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_n_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 %size, i1 true)
+  ret void
+}
+
+
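+; memcpy.inline: guaranteed not to become a libcall, so without +mops it is
+; always expanded to loads and stores.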
+define void @memcpy_inline_0(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_inline_0:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_inline_0:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_inline_0:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_inline_0:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 0, i1 false)
+  ret void
+}
+
+define void @memcpy_inline_0_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_inline_0_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_inline_0_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_inline_0_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_inline_0_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 0, i1 true)
+  ret void
+}
+
+define void @memcpy_inline_10(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_inline_10:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x9, [x1]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x9, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_inline_10:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_inline_10:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x8, [x1]
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_inline_10:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 10, i1 false)
+  ret void
+}
+
+define void @memcpy_inline_10_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memcpy_inline_10_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x8, [x1]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldrh w9, [x1, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w9, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memcpy_inline_10_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memcpy_inline_10_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x8, [x1]
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memcpy_inline_10_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyfp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfm [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpyfe [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 10, i1 true)
+  ret void
+}
+
+
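+; memmove: with +mops the copy uses the CPY* instructions, which also handle
+; overlapping source and destination buffers.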
+define void @memmove_0(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_0:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_0:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_0:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_0:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 0, i1 false)
+  ret void
+}
+
+define void @memmove_0_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_0_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_0_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_0_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_0_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov x8, xzr
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 0, i1 true)
+  ret void
+}
+
+define void @memmove_10(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_10:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x9, [x1]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x9, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_10:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_10:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x9, [x1]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldrh w8, [x1, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x9, [x0]
+; O0-GISel-WITHOUT-MOPS-NEXT:    strh w8, [x0, #8]
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_10:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 10, i1 false)
+  ret void
+}
+
+define void @memmove_10_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_10_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x8, [x1]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldrh w9, [x1, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    strh w9, [x0, #8]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x8, [x0]
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_10_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #10
+; O2-SDAG-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_10_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #10
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memmove
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_10_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #10
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 10, i1 true)
+  ret void
+}
+
+define void @memmove_1000(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_1000:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #1000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memmove
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_1000:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #1000
+; O2-SDAG-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_1000:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memmove
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_1000:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 1000, i1 false)
+  ret void
+}
+
+define void @memmove_1000_volatile(i8* %dst, i8* %src, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_1000_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    mov w2, #1000
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memmove
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_1000_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    mov w8, #1000
+; O2-SDAG-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_1000_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-WITHOUT-MOPS-NEXT:    mov w2, w8
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memmove
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_1000_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    mov w8, #1000
+; O0-GISel-MOPS-NEXT:    // kill: def $x8 killed $w8
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x8!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 1000, i1 true)
+  ret void
+}
+
+define void @memmove_n(i8* %dst, i8* %src, i64 %size, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_n:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memmove
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_n:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    cpyp [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpym [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpye [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_n:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memmove
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_n:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 %size, i1 false)
+  ret void
+}
+
+define void @memmove_n_volatile(i8* %dst, i8* %src, i64 %size, i32 %value) {
+; O2-SDAG-WITHOUT-MOPS-LABEL: memmove_n_volatile:
+; O2-SDAG-WITHOUT-MOPS:       // %bb.0: // %entry
+; O2-SDAG-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O2-SDAG-WITHOUT-MOPS-NEXT:    bl memmove
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O2-SDAG-WITHOUT-MOPS-NEXT:    ret
+;
+; O2-SDAG-MOPS-LABEL: memmove_n_volatile:
+; O2-SDAG-MOPS:       // %bb.0: // %entry
+; O2-SDAG-MOPS-NEXT:    cpyp [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpym [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    cpye [x0]!, [x1]!, x2!
+; O2-SDAG-MOPS-NEXT:    ret
+;
+; O0-GISel-WITHOUT-MOPS-LABEL: memmove_n_volatile:
+; O0-GISel-WITHOUT-MOPS:       // %bb.0: // %entry
+; O0-GISel-WITHOUT-MOPS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_def_cfa_offset 16
+; O0-GISel-WITHOUT-MOPS-NEXT:    .cfi_offset w30, -16
+; O0-GISel-WITHOUT-MOPS-NEXT:    bl memmove
+; O0-GISel-WITHOUT-MOPS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; O0-GISel-WITHOUT-MOPS-NEXT:    ret
+;
+; O0-GISel-MOPS-LABEL: memmove_n_volatile:
+; O0-GISel-MOPS:       // %bb.0: // %entry
+; O0-GISel-MOPS-NEXT:    cpyp [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpym [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    cpye [x0]!, [x1]!, x2!
+; O0-GISel-MOPS-NEXT:    ret
+entry:
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 %size, i1 true)
+  ret void
+}