diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -7830,9 +7830,40 @@
   MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
   uint64_t SrcDisp = MI.getOperand(3).getImm();
   MachineOperand &LengthMO = MI.getOperand(4);
-  uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0;
-  Register LenMinus1Reg =
-      LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister;
+  bool IsImmForm = LengthMO.isImm();
+  bool IsRegForm = !IsImmForm;
+
+  bool NeedsLoop = false;
+  uint64_t ImmLength = 0;
+  Register LenMinus1Reg = SystemZ::NoRegister;
+  if (IsImmForm) {
+    ImmLength = LengthMO.getImm();
+    ImmLength++; // Add back the '1' subtracted originally.
+    if (ImmLength == 0) {
+      MI.eraseFromParent();
+      return MBB;
+    }
+    if (Opcode == SystemZ::CLC) {
+      if (ImmLength > 3 * 256)
+        // A two-CLC sequence is a clear win over a loop, not least because
+        // it needs only one branch.  A three-CLC sequence needs the same
+        // number of branches as a loop (i.e. 2), but is shorter.  That
+        // brings us to lengths greater than 768 bytes.  It seems relatively
+        // likely that a difference will be found within the first 768 bytes,
+        // so we just optimize for the smallest number of branch
+        // instructions, in order to avoid polluting the prediction buffer
+        // too much.
+        NeedsLoop = true;
+    } else if (ImmLength > 6 * 256)
+      // The heuristic we use is to prefer loops for anything that would
+      // require 7 or more MVCs.  With these kinds of sizes there isn't much
+      // to choose between straight-line code and looping code, since the
+      // time will be dominated by the MVCs themselves.
+      NeedsLoop = true;
+  } else {
+    NeedsLoop = true;
+    LenMinus1Reg = LengthMO.getReg();
+  }
 
   // When generating more than one CLC, all but the last will need to
   // branch to the end when a difference is found.
@@ -7840,16 +7871,25 @@
                                    ? SystemZ::splitBlockAfter(MI, MBB)
                                    : nullptr);
 
-  // Check for the loop form, in which operand 5 is the trip count.
-  if (MI.getNumExplicitOperands() > 5) {
-    Register StartCountReg = MI.getOperand(5).getReg();
-    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+  if (NeedsLoop) {
+    Register StartCountReg =
+      MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+    if (IsImmForm) {
+      TII->loadImmediate(*MBB, MI, StartCountReg, ImmLength / 256);
+      ImmLength &= 255;
+    } else {
+      BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
+        .addReg(LenMinus1Reg)
+        .addReg(0)
+        .addImm(8);
+    }
 
     auto loadZeroAddress = [&]() -> MachineOperand {
       Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
       BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
       return MachineOperand::CreateReg(Reg, false);
     };
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
     if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
       DestBase = loadZeroAddress();
     if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
@@ -7876,7 +7916,7 @@
     Register ThisCountReg = MRI.createVirtualRegister(RC);
     Register NextCountReg = MRI.createVirtualRegister(RC);
 
-    if (LengthMO.isReg()) {
+    if (IsRegForm) {
       AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB);
       StartMBB = SystemZ::emitBlockAfter(MBB);
       LoopMBB = SystemZ::emitBlockAfter(StartMBB);
@@ -7916,7 +7956,6 @@
 
       DestBase = MachineOperand::CreateReg(NextDestReg, false);
       SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
-      ImmLength &= 255;
       if (EndMBB && !ImmLength)
         // If the loop handled the whole CLC range, DoneMBB will be empty with
         // CC live-through into EndMBB, so add it as live-in.
@@ -7987,7 +8026,7 @@
     MBB->addSuccessor(DoneMBB);
 
     MBB = DoneMBB;
-    if (LengthMO.isReg()) {
+    if (IsRegForm) {
       // DoneMBB:
       // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
       // # Use EXecute Relative Long for the remainder of the bytes. The target
@@ -8005,7 +8044,6 @@
         BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
           .addReg(StartSrcReg).addMBB(StartMBB)
           .addReg(NextSrcReg).addMBB(LoopMBB);
-      MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass);
       BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
         .addImm(Opcode)
         .addReg(LenMinus1Reg)
@@ -8530,21 +8568,16 @@
 
   case SystemZ::ATOMIC_CMP_SWAPW:
     return emitAtomicCmpSwapW(MI, MBB);
-  case SystemZ::MVCSequence:
-  case SystemZ::MVCLoop:
+  case SystemZ::MVCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
-  case SystemZ::NCSequence:
-  case SystemZ::NCLoop:
+  case SystemZ::NCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::NC);
-  case SystemZ::OCSequence:
-  case SystemZ::OCLoop:
+  case SystemZ::OCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::OC);
-  case SystemZ::XCSequence:
-  case SystemZ::XCLoop:
-  case SystemZ::XCLoopVarLen:
+  case SystemZ::XCImm:
+  case SystemZ::XCReg:
     return emitMemMemWrapper(MI, MBB, SystemZ::XC);
-  case SystemZ::CLCSequence:
-  case SystemZ::CLCLoop:
+  case SystemZ::CLCImm:
     return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
   case SystemZ::CLSTLoop:
     return emitStringWrapper(MI, MBB, SystemZ::CLST);
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -128,9 +128,10 @@
                                     (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 }
 
-defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCSequence, 4>;
-defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCSequence, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
+// The length is given as one less for MVCImm.
+defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCImm, 3>;
+defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCImm, 7>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCImm, 15>;
 
 //===----------------------------------------------------------------------===//
 // Load instructions
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5329,42 +5329,33 @@
 
 // Define an instruction that operates on two fixed-length blocks of memory,
 // and associated pseudo instructions for operating on blocks of any size.
-// The Sequence form uses a straight-line sequence of instructions and
-// the Loop form uses a loop of length-256 instructions followed by
-// another instruction to handle the excess.
-// The LoopVarLen form is for a loop with a non-constant length parameter.
-multiclass MemorySS<string mnemonic, bits<8> opcode,
-                    SDPatternOperator sequence, SDPatternOperator loop> {
+// There are two pseudos, one for a constant length and one for a variable
+// length. The length operand of a pseudo is actually one less than the
+// intended number of bytes, since the register case needs to use an EXRL
+// with a target instruction that always adds one to the length.
+multiclass MemorySS<string mnemonic, bits<8> opcode, SDPatternOperator memop> {
   def "" : SideEffectBinarySSa<mnemonic, opcode>;
   let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in {
-    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                       imm64:$length),
-                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
-                                      imm64:$length)]>;
-    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                   imm64:$length, GR64:$count256),
-                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
-                             imm64:$length, GR64:$count256)]>;
-    def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                         GR64:$length, GR64:$count256),
-                            [(loop bdaddr12only:$dest, bdaddr12only:$src,
-                                   GR64:$length, GR64:$count256)]>;
+    def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                  imm64:$length),
+                             [(memop bdaddr12only:$dest, bdaddr12only:$src,
+                                     imm64:$length)]>;
+    def Reg : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                  ADDR64:$length),
+                             [(memop bdaddr12only:$dest, bdaddr12only:$src,
+                                     ADDR64:$length)]>;
   }
 }
 
 // The same, but setting a CC result as comparison operator.
 multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
-                          SDPatternOperator sequence, SDPatternOperator loop> {
+                           SDPatternOperator memop> {
   def "" : SideEffectBinarySSa<mnemonic, opcode>;
   let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
-    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                       imm64:$length),
-                           [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src,
-                                               imm64:$length))]>;
-    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
-                                   imm64:$length, GR64:$count256),
-                      [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src,
-                                      imm64:$length, GR64:$count256))]>;
+    def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                  imm64:$length),
+                          [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src,
+                                           imm64:$length))]>;
   }
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -503,7 +503,7 @@
 
 // Memory-to-memory moves.
 let mayLoad = 1, mayStore = 1 in
-  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc>;
 let mayLoad = 1, mayStore = 1, Defs = [CC] in {
   def MVCL  : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>;
   def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>;
@@ -1200,7 +1200,7 @@
 
   // Block AND.
   let mayLoad = 1, mayStore = 1 in
-    defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
+    defm NC : MemorySS<"nc", 0xD4, z_nc>;
 }
 defm : RMWIByte<and, bdaddr12pair, NI>;
 defm : RMWIByte<and, bdaddr20pair, NIY>;
@@ -1257,7 +1257,7 @@
 
   // Block OR.
   let mayLoad = 1, mayStore = 1 in
-    defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
+    defm OC : MemorySS<"oc", 0xD6, z_oc>;
 }
 defm : RMWIByte<or, bdaddr12pair, OI>;
 defm : RMWIByte<or, bdaddr20pair, OIY>;
@@ -1297,7 +1297,7 @@
 
   // Block XOR.
   let mayLoad = 1, mayStore = 1 in
-    defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
+    defm XC : MemorySS<"xc", 0xD7, z_xc>;
 }
 defm : RMWIByte<xor, bdaddr12pair, XI>;
 defm : RMWIByte<xor, bdaddr20pair, XIY>;
@@ -1624,7 +1624,7 @@
 
 // Memory-to-memory comparison.
 let mayLoad = 1, Defs = [CC] in {
-  defm CLC : CompareMemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+  defm CLC : CompareMemorySS<"clc", 0xD5, z_clc>;
   def CLCL  : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>;
   def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>;
   def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>;
@@ -2355,21 +2355,15 @@
             (RLLG GR64:$val, (LCR GR32:$shift), 0)>;
 }
 
-// Peepholes for turning scalar operations into block operations.
-defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 1>;
-defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 2>;
-defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 4>;
-defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
-                      OCSequence, XCSequence, 1>;
-defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 2>;
-defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 4>;
-defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
-                      XCSequence, 8>;
+// Peepholes for turning scalar operations into block operations.  The length
+// is given as one less for these pseudos.
+defm : BlockLoadStore<anyextloadi8, i32, MVCImm, NCImm, OCImm, XCImm, 0>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCImm, NCImm, OCImm, XCImm, 1>;
+defm : BlockLoadStore<load, i32, MVCImm, NCImm, OCImm, XCImm, 3>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCImm, NCImm, OCImm, XCImm, 0>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCImm, NCImm, OCImm, XCImm, 1>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCImm, NCImm, OCImm, XCImm, 3>;
+defm : BlockLoadStore<load, i64, MVCImm, NCImm, OCImm, XCImm, 7>;
 
 //===----------------------------------------------------------------------===//
 // Mnemonic Aliases
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -102,17 +102,6 @@
                                              SDTCisPtrTy<1>,
                                              SDTCisPtrTy<2>,
                                              SDTCisVT<3, i64>]>;
-def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
-                                            [SDTCisPtrTy<0>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisVT<2, i64>,
-                                             SDTCisVT<3, i64>]>;
-def SDT_ZMemMemLoopCC       : SDTypeProfile<1, 4,
-                                            [SDTCisVT<0, i32>,
-                                             SDTCisPtrTy<1>,
-                                             SDTCisPtrTy<2>,
-                                             SDTCisVT<3, i64>,
-                                             SDTCisVT<4, i64>]>;
 def SDT_ZString             : SDTypeProfile<1, 3,
                                             [SDTCisPtrTy<0>,
                                              SDTCisPtrTy<1>,
@@ -416,24 +405,14 @@
 
 def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
-                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_nc_loop           : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
-                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_oc_loop           : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
-                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
                                   [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
-def z_xc_loop           : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
-                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
 def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
-def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoopCC,
-                                 [SDNPHasChain, SDNPMayLoad]>;
 def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
                                  [SDNPHasChain, SDNPMayLoad]>;
 def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -17,32 +17,34 @@
 
 #define DEBUG_TYPE "systemz-selectiondag-info"
 
-// Decide whether it is best to use a loop or straight-line code for
-// a block operation of Size bytes with source address Src and destination
-// address Dest.  Sequence is the opcode to use for straight-line code
-// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
-// Return the chain for the completed operation.
-static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
-                          unsigned Loop, SDValue Chain, SDValue Dst,
-                          SDValue Src, uint64_t Size) {
-  EVT PtrVT = Src.getValueType();
-  // The heuristic we use is to prefer loops for anything that would
-  // require 7 or more MVCs.  With these kinds of sizes there isn't
-  // much to choose between straight-line code and looping code,
-  // since the time will be dominated by the MVCs themselves.
-  // However, the loop has 4 or 5 instructions (depending on whether
-  // the base addresses can be proved equal), so there doesn't seem
-  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
-  // the range (5 * 256, 6 * 256) will need another instruction after
-  // the loop, so it doesn't seem worth using a loop then either.
-  // The next value up, 6 * 256, can be implemented in the same
-  // number of straight-line MVCs as 6 * 256 - 1.
-  if (Size > 6 * 256)
-    return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
-                       DAG.getConstant(Size, DL, PtrVT),
-                       DAG.getConstant(Size / 256, DL, PtrVT));
-  return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
-                     DAG.getConstant(Size, DL, PtrVT));
+// Emit a mem-mem operation with a length operand of Size - 1; the one is
+// added back during pseudo expansion. The Imm and Reg forms are encoded the
+// same way since DAGCombiner may turn a Reg length into an Imm one. A Size
+// of 0 wraps to all-ones in the Imm form (uint64_t arithmetic).
+static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+                             SDValue Chain, SDValue Dst, SDValue Src,
+                             uint64_t Size) {
+  return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size - 1, DL, Src.getValueType()));
+}
+
+static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+                             SDValue Chain, SDValue Dst, SDValue Src,
+                             SDValue Size) {
+  SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
+                                  DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+                                  DAG.getConstant(-1, DL, MVT::i64));
+  return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src, LenMinus1);
+}
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size). As in
+// emitMemMemImm, the length operand is Size - 1. Returns an (i32, chain) node.
+static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+  EVT PtrVT = Src1.getValueType();
+  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+                     DAG.getConstant(Size - 1, DL, PtrVT));
 }
 
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
@@ -53,8 +55,8 @@
     return SDValue();
 
   if (auto *CSize = dyn_cast<ConstantSDNode>(Size))
-    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
-                      Chain, Dst, Src, CSize->getZExtValue());
+    return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, Dst, Src,
+                         CSize->getZExtValue());
   return SDValue();
 }
 
@@ -127,52 +129,23 @@
 
     // Handle the special case of a memset of 0, which can use XC.
     if (CByte && CByte->getZExtValue() == 0)
-      return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
-                        Chain, Dst, Dst, Bytes);
+      return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
 
     // Copy the byte to the first location and then use MVC to copy
     // it to the rest.
     Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
     SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
                                    DAG.getConstant(1, DL, PtrVT));
-    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
-                      Chain, DstPlus1, Dst, Bytes - 1);
+    return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
+                         Bytes - 1);
   }
 
   // Variable length
-  if (CByte && CByte->getZExtValue() == 0) {
+  if (CByte && CByte->getZExtValue() == 0)
     // Handle the special case of a variable length memset of 0 with XC.
-    SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
-                                    DAG.getZExtOrTrunc(Size, DL, MVT::i64),
-                                    DAG.getConstant(-1, DL, MVT::i64));
-    SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1,
-                                DAG.getConstant(8, DL, MVT::i64));
-    return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst,
-                       LenMinus1, TripC);
-  }
-  return SDValue();
-}
+    return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
 
-// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
-// deciding whether to use a loop or straight-line code.
-static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
-                       SDValue Src1, SDValue Src2, uint64_t Size) {
-  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
-  EVT PtrVT = Src1.getValueType();
-  // A two-CLC sequence is a clear win over a loop, not least because it
-  // needs only one branch.  A three-CLC sequence needs the same number
-  // of branches as a loop (i.e. 2), but is shorter.  That brings us to
-  // lengths greater than 768 bytes.  It seems relatively likely that
-  // a difference will be found within the first 768 bytes, so we just
-  // optimize for the smallest number of branch instructions, in order
-  // to avoid polluting the prediction buffer too much.  A loop only ever
-  // needs 2 branches, whereas a straight-line sequence would need 3 or more.
-  if (Size > 3 * 256)
-    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
-                       DAG.getConstant(Size, DL, PtrVT),
-                       DAG.getConstant(Size / 256, DL, PtrVT));
-  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
-                     DAG.getConstant(Size, DL, PtrVT));
+  return SDValue();
 }
 
 // Convert the current CC value into an integer that is 0 if CC == 0,
diff --git a/llvm/test/CodeGen/SystemZ/memset-05.ll b/llvm/test/CodeGen/SystemZ/memset-05.ll
--- a/llvm/test/CodeGen/SystemZ/memset-05.ll
+++ b/llvm/test/CodeGen/SystemZ/memset-05.ll
@@ -48,37 +48,37 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    llgfr %r1, %r3
 ; CHECK-NEXT:    aghi %r1, -1
-; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    cgije %r1, -1, .LBB2_5
 ; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    lgr %r3, %r2
 ; CHECK-NEXT:    cgije %r0, 0, .LBB2_4
 ; CHECK-NEXT:  # %bb.2:
 ; CHECK-NEXT:    lgr %r3, %r2
-; CHECK-NEXT:    lgr %r4, %r0
 ; CHECK-NEXT:  .LBB2_3: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xc 0(256,%r3), 0(%r3)
 ; CHECK-NEXT:    la %r3, 256(%r3)
-; CHECK-NEXT:    brctg %r4, .LBB2_3
+; CHECK-NEXT:    brctg %r0, .LBB2_3
 ; CHECK-NEXT:  .LBB2_4:
 ; CHECK-NEXT:    exrl %r1, .Ltmp1
 ; CHECK-NEXT:  .LBB2_5:
 ; CHECK-NEXT:    cgije %r1, -1, .LBB2_10
 ; CHECK-NEXT:  # %bb.6:
+; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    lgr %r3, %r2
 ; CHECK-NEXT:    cgije %r0, 0, .LBB2_9
 ; CHECK-NEXT:  # %bb.7:
 ; CHECK-NEXT:    lgr %r3, %r2
-; CHECK-NEXT:    lgr %r4, %r0
 ; CHECK-NEXT:  .LBB2_8: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xc 0(256,%r3), 0(%r3)
 ; CHECK-NEXT:    la %r3, 256(%r3)
-; CHECK-NEXT:    brctg %r4, .LBB2_8
+; CHECK-NEXT:    brctg %r0, .LBB2_8
 ; CHECK-NEXT:  .LBB2_9:
 ; CHECK-NEXT:    exrl %r1, .Ltmp1
 ; CHECK-NEXT:  .LBB2_10:
 ; CHECK-NEXT:    cgibe %r1, -1, 0(%r14)
 ; CHECK-NEXT:  .LBB2_11:
+; CHECK-NEXT:    srlg %r0, %r1, 8
 ; CHECK-NEXT:    cgije %r0, 0, .LBB2_13
 ; CHECK-NEXT:  .LBB2_12: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
@@ -114,6 +114,135 @@
   ret void
 }
 
+; Test that a memset with a length argument that DAGCombiner will convert
+; into a constant gets the correct number of bytes set.
+@Data = external hidden constant [1024 x i8], align 2
+define void @fun4() {
+; CHECK-LABEL: fun4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(256,%r1), 35(%r1)
+; CHECK-NEXT:    xc 291(256,%r1), 291(%r1)
+; CHECK-NEXT:    xc 547(256,%r1), 547(%r1)
+; CHECK-NEXT:    xc 803(221,%r1), 803(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 989
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 0) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 0, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 0) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 0, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 0.
+define void @fun5() {
+; CHECK-LABEL: fun5:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mvghi 0(%r1), 0
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 1.
+define void @fun6() {
+; CHECK-LABEL: fun6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(1,%r1), 35(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 1
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 36) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 36) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 256.
+define void @fun7() {
+; CHECK-LABEL: fun7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(256,%r1), 35(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 256
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 291) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 291) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
+; The same, with a resulting constant length of 257.
+define void @fun8() {
+; CHECK-LABEL: fun8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    larl %r1, Data
+; CHECK-NEXT:    xc 35(256,%r1), 35(%r1)
+; CHECK-NEXT:    xc 291(1,%r1), 291(%r1)
+; CHECK-NEXT:    mvghi 0(%r1), 257
+; CHECK-NEXT:    br %r14
+  call void @llvm.memset.p0i8.i64(
+     i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35),
+     i8 0,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 292) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)),
+     i1 false)
+  %i11 = getelementptr i8, i8* null,
+     i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 292) to i64), i64 1),
+              i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8],
+                                     [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1))
+  store i8* %i11, i8** undef, align 8
+  ret void
+}
+
 ; CHECK:       .Ltmp2:
 ; CHECK-NEXT: 	 xc 0(1,%r1), 0(%r1)
 ; CHECK-NEXT:  .Ltmp0:
diff --git a/llvm/test/CodeGen/SystemZ/mverify-optypes.mir b/llvm/test/CodeGen/SystemZ/mverify-optypes.mir
--- a/llvm/test/CodeGen/SystemZ/mverify-optypes.mir
+++ b/llvm/test/CodeGen/SystemZ/mverify-optypes.mir
@@ -31,8 +31,8 @@
     %4:addr64bit = LARL @gsrc
     %4:addr64bit = LARL $r2l
 
-    MVCLoop %4, 0, %3, 0, 1680, %0, implicit-def $cc
-    MVCLoop %4, 0, %3, 0, %1, %0, implicit-def $cc
+    MVCImm %4, 0, %3, 0, 1680, implicit-def $cc
+    MVCImm %4, 0, %3, 0, %1, implicit-def $cc
 
     BCR 0, 0, $r2d, implicit $cc
     BCR 0, $r2d, $r2d, implicit $cc
@@ -63,7 +63,7 @@
 # CHECK: - operand 1:   $r2l
 
 # CHECK: *** Bad machine code: Expected a non-register operand. ***
-# CHECK: - instruction: MVCLoop %4:addr64bit, 0, %3:addr64bit, 0, %1:addr64bit, %0:gr64bit, implicit-def $cc
+# CHECK: - instruction: MVCImm %4:addr64bit, 0, %3:addr64bit, 0, %1:addr64bit, implicit-def $cc
 # CHECK: - operand 4:   %1:addr64bit
 
 # CHECK: *** Bad machine code: Expected a non-register operand. ***