Index: include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- include/llvm/CodeGen/TargetInstrInfo.h
+++ include/llvm/CodeGen/TargetInstrInfo.h
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineOutliner.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -932,9 +933,12 @@
   /// operand folded, otherwise NULL is returned.
   /// The new instruction is inserted before MI, and the client is responsible
   /// for removing the old instruction.
+  /// If VRM is passed, the assigned physregs can be inspected by target to
+  /// decide on using an opcode (note that those assignments can still change).
   MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
                                   int FI,
-                                  LiveIntervals *LIS = nullptr) const;
+                                  LiveIntervals *LIS = nullptr,
+                                  VirtRegMap *VRM = nullptr) const;
 
   /// Same as the previous version except it allows folding of any load and
   /// store from / to any address, not just from a specific stack slot.
@@ -1024,7 +1028,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr) const {
+                        LiveIntervals *LIS = nullptr,
+                        VirtRegMap *VRM = nullptr) const {
     return nullptr;
   }
 
Index: include/llvm/CodeGen/TargetPassConfig.h
===================================================================
--- include/llvm/CodeGen/TargetPassConfig.h
+++ include/llvm/CodeGen/TargetPassConfig.h
@@ -386,6 +386,11 @@
     return false;
   }
 
+  /// Add passes to be run immediately after virtual registers are rewritten
+  /// to physical registers. These passes may replace an MI with a new one,
+  /// but should preserve SlotIndexes while doing so.
+  virtual void addPostRewrite() { }
+
   /// This method may be implemented by targets that want to run passes after
   /// register allocation pass pipeline but before prolog-epilog insertion.
   virtual void addPostRegAlloc() { }
Index: lib/CodeGen/InlineSpiller.cpp
===================================================================
--- lib/CodeGen/InlineSpiller.cpp
+++ lib/CodeGen/InlineSpiller.cpp
@@ -837,7 +837,7 @@
 
   MachineInstr *FoldMI =
       LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS)
-             : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS);
+             : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS, &VRM);
   if (!FoldMI)
     return false;
 
Index: lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- lib/CodeGen/TargetInstrInfo.cpp
+++ lib/CodeGen/TargetInstrInfo.cpp
@@ -524,7 +524,8 @@
 
 MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
                                                  ArrayRef<unsigned> Ops, int FI,
-                                                 LiveIntervals *LIS) const {
+                                                 LiveIntervals *LIS,
+                                                 VirtRegMap *VRM) const {
   auto Flags = MachineMemOperand::MONone;
   for (unsigned OpIdx : Ops)
     Flags |= MI.getOperand(OpIdx).isDef() ? MachineMemOperand::MOStore
@@ -570,7 +571,7 @@
       MBB->insert(MI, NewMI);
   } else {
     // Ask the target to do the actual folding.
-    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS);
+    NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS, VRM);
   }
 
   if (NewMI) {
Index: lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- lib/CodeGen/TargetPassConfig.cpp
+++ lib/CodeGen/TargetPassConfig.cpp
@@ -1168,6 +1168,10 @@
   addPass(&MachineSchedulerID);
 
   if (addRegAssignmentOptimized()) {
+    // Allow targets to expand pseudo instructions depending on the choice of
+    // registers before MachineCopyPropagation.
+    addPostRewrite();
+
     // Copy propagate to forward register uses and try to eliminate COPYs that
     // were not coalesced.
     addPass(&MachineCopyPropagationID);
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -162,7 +162,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr) const override;
+                        LiveIntervals *LIS = nullptr,
+                        VirtRegMap *VRM = nullptr) const override;
 
   /// \returns true if a branch from an instruction with opcode \p BranchOpc
   ///  bytes is capable of jumping to a position \p BrOffset bytes away.
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3049,7 +3049,7 @@
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
-    LiveIntervals *LIS) const {
+    LiveIntervals *LIS, VirtRegMap *VRM) const {
   // This is a bit of a hack. Consider this instruction:
   //
   //   %0 = COPY %sp; GPR64all:%0
Index: lib/Target/SystemZ/CMakeLists.txt
===================================================================
--- lib/Target/SystemZ/CMakeLists.txt
+++ lib/Target/SystemZ/CMakeLists.txt
@@ -30,6 +30,7 @@
   SystemZMCInstLower.cpp
   SystemZRegisterInfo.cpp
   SystemZSelectionDAGInfo.cpp
+  SystemZPostRewrite.cpp
   SystemZShortenInst.cpp
   SystemZSubtarget.cpp
   SystemZTargetMachine.cpp
Index: lib/Target/SystemZ/SystemZ.h
===================================================================
--- lib/Target/SystemZ/SystemZ.h
+++ lib/Target/SystemZ/SystemZ.h
@@ -194,6 +194,7 @@
 FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZTDCPass();
 } // end namespace llvm
 
Index: lib/Target/SystemZ/SystemZInstrFormats.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrFormats.td
+++ lib/Target/SystemZ/SystemZInstrFormats.td
@@ -37,6 +37,12 @@
   string OpKey = "";
   string OpType = "none";
 
+  // MemKey identifies a targe reg-mem opcode, while MemType can be either
+  // "pseudo" or "target". This is used to map a pseduo memory instruction to
+  // its corresponding target opcode. See comment at MemFoldPseudo.
+  string MemKey = "";
+  string MemType = "none";
+
   // Many distinct-operands instructions have older 2-operand equivalents.
   // NumOpsKey uniquely identifies one of these 2-operand and 3-operand pairs,
   // with NumOpsValue being "2" or "3" as appropriate.
@@ -97,6 +103,12 @@
   let TSFlags{20}    = IsLogical;
 }
 
+class Pseudo<dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<0, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // Mappings between instructions
 //===----------------------------------------------------------------------===//
@@ -120,7 +132,8 @@
   let ValueCols = [["20"]];
 }
 
-// Return the memory form of a register instruction.
+// Return the memory form of a register instruction. Note that this may
+// return a MemFoldPseudo instruction (see below).
 def getMemOpcode : InstrMapping {
   let FilterClass = "InstSystemZ";
   let RowFields = ["OpKey"];
@@ -129,13 +142,22 @@
   let ValueCols = [["mem"]];
 }
 
-// Return the 3-operand form of a 2-operand instruction.
-def getThreeOperandOpcode : InstrMapping {
+// Return the target memory instruction for a MemFoldPseudo.
+def getTargetMemOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["MemKey"];
+  let ColFields = ["MemType"];
+  let KeyCol = ["pseudo"];
+  let ValueCols = [["target"]];
+}
+
+// Return the 2-operand form of a 3-operand instruction.
+def getTwoOperandOpcode : InstrMapping {
   let FilterClass = "InstSystemZ";
   let RowFields = ["NumOpsKey"];
   let ColFields = ["NumOpsValue"];
-  let KeyCol = ["2"];
-  let ValueCols = [["3"]];
+  let KeyCol = ["3"];
+  let ValueCols = [["2"]];
 }
 
 //===----------------------------------------------------------------------===//
@@ -3066,6 +3088,8 @@
              mnemonic#"\t$R1, $R2, $R3",
              [(set cls1:$R1, (operator cls2:$R2, cls3:$R3))]> {
   let M4 = 0;
+  let OpKey = mnemonic#cls1;
+  let OpType = "reg";
 }
 
 multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
@@ -3073,9 +3097,9 @@
                         RegisterOperand cls2> {
   let NumOpsKey = mnemonic in {
     let NumOpsValue = "3" in
-      def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+      def K : BinaryRRFa<mnemonic#"k", opcode2, operator, cls1, cls1, cls2>,
               Requires<[FeatureDistinctOps]>;
-    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+    let NumOpsValue = "2" in
       def "" : BinaryRR<mnemonic, opcode1, operator, cls1, cls2>;
   }
 }
@@ -3085,9 +3109,9 @@
                          RegisterOperand cls2> {
   let NumOpsKey = mnemonic in {
     let NumOpsValue = "3" in
-      def K : BinaryRRFa<mnemonic#"k", opcode2, null_frag, cls1, cls1, cls2>,
+      def K : BinaryRRFa<mnemonic#"k", opcode2, operator, cls1, cls1, cls2>,
               Requires<[FeatureDistinctOps]>;
-    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+    let NumOpsValue = "2" in
       def "" : BinaryRRE<mnemonic, opcode1, operator, cls1, cls2>;
   }
 }
@@ -3188,9 +3212,9 @@
                         Immediate imm> {
   let NumOpsKey = mnemonic in {
     let NumOpsValue = "3" in
-      def K : BinaryRIE<mnemonic##"k", opcode2, null_frag, cls, imm>,
+      def K : BinaryRIE<mnemonic##"k", opcode2, operator, cls, imm>,
               Requires<[FeatureDistinctOps]>;
-    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+    let NumOpsValue = "2" in
       def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
   }
 }
@@ -3265,9 +3289,9 @@
                         SDPatternOperator operator, RegisterOperand cls> {
   let NumOpsKey = mnemonic in {
     let NumOpsValue = "3" in
-      def K  : BinaryRSY<mnemonic##"k", opcode2, null_frag, cls>,
+      def K  : BinaryRSY<mnemonic##"k", opcode2, operator, cls>,
                Requires<[FeatureDistinctOps]>;
-    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+    let NumOpsValue = "2" in
       def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
   }
 }
@@ -3334,6 +3358,36 @@
   let AccessBytes = bytes;
 }
 
+// A pseudo that is used during register allocation when folding a memory
+// operand. The 3-address register instruction with a spilled source cannot
+// be converted directly to a target 2-address reg/mem instruction.
+// Mapping:  <INSN>R  ->  MemFoldPseudo  ->  <INSN>
+class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes,
+                    AddressingMode mode>
+  : Pseudo<(outs cls:$R1), (ins cls:$R2, mode:$XBD2), []> {
+    let OpKey = mnemonic#"rk"#cls;
+    let OpType = "mem";
+    let MemKey = mnemonic#cls;
+    let MemType = "pseudo";
+    let mayLoad = 1;
+    let AccessBytes = bytes;
+    let HasIndex = 1;
+    let hasNoSchedulingInfo = 1;
+}
+
+multiclass BinaryRXYAndPseudo<string mnemonic, bits<16> opcode,
+                              SDPatternOperator operator, RegisterOperand cls,
+                              SDPatternOperator load, bits<5> bytes,
+                              AddressingMode mode = bdxaddr20only> {
+
+  def "" : BinaryRXY<mnemonic, opcode, operator, cls, load, bytes, mode> {
+    let MemKey = mnemonic#cls;
+    let MemType = "target";
+  }
+  let Has20BitOffset = 1 in
+    def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, mode>;
+}
+
 multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
                         SDPatternOperator operator, RegisterOperand cls,
                         SDPatternOperator load, bits<5> bytes> {
@@ -3347,6 +3401,24 @@
   }
 }
 
+multiclass BinaryRXPairAndPseudo<string mnemonic, bits<8> rxOpcode,
+                                 bits<16> rxyOpcode, SDPatternOperator operator,
+                                 RegisterOperand cls,
+                                 SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
+                      bdxaddr12pair> {
+      let DispSize = "12";
+      let MemKey = mnemonic#cls;
+      let MemType = "target";
+    }
+    let DispSize = "20" in
+      def Y  : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load,
+                         bytes, bdxaddr20pair>;
+  }
+  def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, bdxaddr12pair>;
+}
+
 class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                Operand imm, AddressingMode mode = bdaddr12only>
   : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
@@ -4539,12 +4611,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-class Pseudo<dag outs, dag ins, list<dag> pattern>
-  : InstSystemZ<0, outs, ins, "", pattern> {
-  let isPseudo = 1;
-  let isCodeGenOnly = 1;
-}
-
 // Like UnaryRI, but expanded after RA depending on the choice of register.
 class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
                     Immediate imm>
@@ -4593,9 +4659,9 @@
                               RegisterOperand cls, Immediate imm> {
   let NumOpsKey = key in {
     let NumOpsValue = "3" in
-      def K : BinaryRIEPseudo<null_frag, cls, imm>,
+      def K : BinaryRIEPseudo<operator, cls, imm>,
               Requires<[FeatureHighWord, FeatureDistinctOps]>;
-    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+    let NumOpsValue = "2" in
       def "" : BinaryRIPseudo<operator, cls, imm>,
                Requires<[FeatureHighWord]>;
   }
Index: lib/Target/SystemZ/SystemZInstrInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.h
+++ lib/Target/SystemZ/SystemZInstrInfo.h
@@ -141,6 +141,11 @@
 
 } // end namespace SystemZII
 
+namespace SystemZ {
+int getTwoOperandOpcode(uint16_t Opcode);
+int getTargetMemOpcode(uint16_t Opcode);
+}
+
 class SystemZInstrInfo : public SystemZGenInstrInfo {
   const SystemZRegisterInfo RI;
   SystemZSubtarget &STI;
@@ -248,7 +253,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr) const override;
+                        LiveIntervals *LIS = nullptr,
+                        VirtRegMap *VRM = nullptr) const override;
   MachineInstr *foldMemoryOperandImpl(
       MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
       MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
Index: lib/Target/SystemZ/SystemZInstrInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -957,73 +957,13 @@
   }
 }
 
-// Used to return from convertToThreeAddress after replacing two-address
-// instruction OldMI with three-address instruction NewMI.
-static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
-                                                 MachineInstr *NewMI,
-                                                 LiveVariables *LV) {
-  if (LV) {
-    unsigned NumOps = OldMI->getNumOperands();
-    for (unsigned I = 1; I < NumOps; ++I) {
-      MachineOperand &Op = OldMI->getOperand(I);
-      if (Op.isReg() && Op.isKill())
-        LV->replaceKillInstruction(Op.getReg(), *OldMI, *NewMI);
-    }
-  }
-  transferDeadCC(OldMI, NewMI);
-  return NewMI;
-}
-
 MachineInstr *SystemZInstrInfo::convertToThreeAddress(
     MachineFunction::iterator &MFI, MachineInstr &MI, LiveVariables *LV) const {
   MachineBasicBlock *MBB = MI.getParent();
-  MachineFunction *MF = MBB->getParent();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-
-  unsigned Opcode = MI.getOpcode();
-  unsigned NumOps = MI.getNumOperands();
-
-  // Try to convert something like SLL into SLLK, if supported.
-  // We prefer to keep the two-operand form where possible both
-  // because it tends to be shorter and because some instructions
-  // have memory forms that can be used during spilling.
-  if (STI.hasDistinctOps()) {
-    MachineOperand &Dest = MI.getOperand(0);
-    MachineOperand &Src = MI.getOperand(1);
-    unsigned DestReg = Dest.getReg();
-    unsigned SrcReg = Src.getReg();
-    // AHIMux is only really a three-operand instruction when both operands
-    // are low registers.  Try to constrain both operands to be low if
-    // possible.
-    if (Opcode == SystemZ::AHIMux &&
-        TargetRegisterInfo::isVirtualRegister(DestReg) &&
-        TargetRegisterInfo::isVirtualRegister(SrcReg) &&
-        MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
-        MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
-      MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
-      MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
-    }
-    int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
-    if (ThreeOperandOpcode >= 0) {
-      // Create three address instruction without adding the implicit
-      // operands. Those will instead be copied over from the original
-      // instruction by the loop below.
-      MachineInstrBuilder MIB(
-          *MF, MF->CreateMachineInstr(get(ThreeOperandOpcode), MI.getDebugLoc(),
-                                      /*NoImplicit=*/true));
-      MIB.add(Dest);
-      // Keep the kill state, but drop the tied flag.
-      MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
-      // Keep the remaining operands as-is.
-      for (unsigned I = 2; I < NumOps; ++I)
-        MIB.add(MI.getOperand(I));
-      MBB->insert(MI, MIB);
-      return finishConvertToThreeAddress(&MI, MIB, LV);
-    }
-  }
 
   // Try to convert an AND into an RISBG-type instruction.
-  if (LogicOp And = interpretAndImmediate(Opcode)) {
+  // TODO: It might be beneficial to select RISBG and shorten to AND instead.
+  if (LogicOp And = interpretAndImmediate(MI.getOpcode())) {
     uint64_t Imm = MI.getOperand(2).getImm() << And.ImmLSB;
     // AND IMMEDIATE leaves the other bits of the register unchanged.
     Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
@@ -1051,7 +991,16 @@
               .addImm(Start)
               .addImm(End + 128)
               .addImm(0);
-      return finishConvertToThreeAddress(&MI, MIB, LV);
+      if (LV) {
+        unsigned NumOps = MI.getNumOperands();
+        for (unsigned I = 1; I < NumOps; ++I) {
+          MachineOperand &Op = MI.getOperand(I);
+          if (Op.isReg() && Op.isKill())
+            LV->replaceKillInstruction(Op.getReg(), MI, *MIB);
+        }
+      }
+      transferDeadCC(&MI, MIB);
+      return MIB;
     }
   }
   return nullptr;
@@ -1060,7 +1009,7 @@
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
-    LiveIntervals *LIS) const {
+    LiveIntervals *LIS, VirtRegMap *VRM) const {
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   unsigned Size = MFI.getObjectSize(FrameIndex);
@@ -1214,12 +1163,37 @@
     }
   }
 
-  // If the spilled operand is the final one, try to change <INSN>R
-  // into <INSN>.
+  // If the spilled operand is the final one or the instruction is
+  // commutable, try to change <INSN>R into <INSN>.
+  unsigned NumOps = MI.getNumExplicitOperands();
   int MemOpcode = SystemZ::getMemOpcode(Opcode);
+
+  // See if this is a 3-address instruction that is convertible to 2-address
+  // and suitable for folding below.  Only try this whith virtual registers
+  // and a provided VRM (during regalloc).
+  bool NeedsCommute = false;
+  if (SystemZ::getTwoOperandOpcode(Opcode) != -1 && MemOpcode != -1) {
+    if (VRM == nullptr)
+      MemOpcode = -1;
+    else {
+      assert(NumOps == 3 && "Expected two source registers.");
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned DstPhys =
+        (TRI->isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg);
+      unsigned SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg()
+                                    : ((OpNum == 1 && MI.isCommutable())
+                                           ? MI.getOperand(2).getReg()
+                                           : 0));
+      if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg &&
+          TRI->isVirtualRegister(SrcReg) && DstPhys == VRM->getPhys(SrcReg))
+        NeedsCommute = (OpNum == 1);
+      else
+        MemOpcode = -1;
+    }
+  }
+
   if (MemOpcode >= 0) {
-    unsigned NumOps = MI.getNumExplicitOperands();
-    if (OpNum == NumOps - 1) {
+    if ((OpNum == NumOps - 1) || NeedsCommute) {
       const MCInstrDesc &MemDesc = get(MemOpcode);
       uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
       assert(AccessBytes != 0 && "Size of access should be known");
@@ -1227,8 +1201,12 @@
       uint64_t Offset = Size - AccessBytes;
       MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
                                         MI.getDebugLoc(), get(MemOpcode));
-      for (unsigned I = 0; I < OpNum; ++I)
-        MIB.add(MI.getOperand(I));
+      MIB.add(MI.getOperand(0));
+      if (NeedsCommute)
+        MIB.add(MI.getOperand(2));
+      else
+        for (unsigned I = 1; I < OpNum; ++I)
+          MIB.add(MI.getOperand(I));
       MIB.addFrameIndex(FrameIndex).addImm(Offset);
       if (MemDesc.TSFlags & SystemZII::HasIndex)
         MIB.addReg(0);
Index: lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- lib/Target/SystemZ/SystemZInstrInfo.td
+++ lib/Target/SystemZ/SystemZInstrInfo.td
@@ -916,11 +916,11 @@
 
   // Addition of memory.
   defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, z_sadd, GR32, asextloadi16, 2>;
-  defm A   : BinaryRXPair<"a",  0x5A, 0xE35A, z_sadd, GR32, load, 4>;
+  defm A   : BinaryRXPairAndPseudo<"a",  0x5A, 0xE35A, z_sadd, GR32, load, 4>;
   def  AGH : BinaryRXY<"agh", 0xE338, z_sadd, GR64, asextloadi16, 2>,
              Requires<[FeatureMiscellaneousExtensions2]>;
   def  AGF : BinaryRXY<"agf", 0xE318, z_sadd, GR64, asextloadi32, 4>;
-  def  AG  : BinaryRXY<"ag",  0xE308, z_sadd, GR64, load, 8>;
+  defm AG  : BinaryRXYAndPseudo<"ag",  0xE308, z_sadd, GR64, load, 8>;
 
   // Addition to memory.
   def ASI  : BinarySIY<"asi",  0xEB6A, add, imm32sx8>;
@@ -958,9 +958,9 @@
               Requires<[FeatureHighWord]>;
 
   // Addition of memory.
-  defm AL   : BinaryRXPair<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
+  defm AL   : BinaryRXPairAndPseudo<"al", 0x5E, 0xE35E, z_uadd, GR32, load, 4>;
   def  ALGF : BinaryRXY<"algf", 0xE31A, z_uadd, GR64, azextloadi32, 4>;
-  def  ALG  : BinaryRXY<"alg",  0xE30A, z_uadd, GR64, load, 8>;
+  defm ALG  : BinaryRXYAndPseudo<"alg",  0xE30A, z_uadd, GR64, load, 8>;
 
   // Addition to memory.
   def ALSI  : BinarySIY<"alsi",  0xEB6E, null_frag, imm32sx8>;
@@ -1003,11 +1003,11 @@
 
   // Subtraction of memory.
   defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, z_ssub, GR32, asextloadi16, 2>;
-  defm S   : BinaryRXPair<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
+  defm S   : BinaryRXPairAndPseudo<"s", 0x5B, 0xE35B, z_ssub, GR32, load, 4>;
   def  SGH : BinaryRXY<"sgh", 0xE339, z_ssub, GR64, asextloadi16, 2>,
              Requires<[FeatureMiscellaneousExtensions2]>;
   def  SGF : BinaryRXY<"sgf", 0xE319, z_ssub, GR64, asextloadi32, 4>;
-  def  SG  : BinaryRXY<"sg",  0xE309, z_ssub, GR64, load, 8>;
+  defm SG  : BinaryRXYAndPseudo<"sg",  0xE309, z_ssub, GR64, load, 8>;
 }
 defm : SXB<z_ssub, GR64, SGFR>;
 
@@ -1055,9 +1055,9 @@
   def SLGFI : BinaryRIL<"slgfi", 0xC24, z_usub, GR64, imm64zx32>;
 
   // Subtraction of memory.
-  defm SL   : BinaryRXPair<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
+  defm SL   : BinaryRXPairAndPseudo<"sl", 0x5F, 0xE35F, z_usub, GR32, load, 4>;
   def  SLGF : BinaryRXY<"slgf", 0xE31B, z_usub, GR64, azextloadi32, 4>;
-  def  SLG  : BinaryRXY<"slg",  0xE30B, z_usub, GR64, load, 8>;
+  defm SLG  : BinaryRXYAndPseudo<"slg",  0xE30B, z_usub, GR64, load, 8>;
 }
 defm : ZXB<z_usub, GR64, SLGFR>;
 
@@ -1132,8 +1132,8 @@
 
   // ANDs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm N  : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
-    def  NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>;
+    defm N  : BinaryRXPairAndPseudo<"n", 0x54, 0xE354, and, GR32, load, 4>;
+    defm NG : BinaryRXYAndPseudo<"ng", 0xE380, and, GR64, load, 8>;
   }
 
   // AND to memory
@@ -1189,8 +1189,8 @@
 
   // ORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm O  : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>;
-    def  OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>;
+    defm O  : BinaryRXPairAndPseudo<"o", 0x56, 0xE356, or, GR32, load, 4>;
+    defm OG : BinaryRXYAndPseudo<"og", 0xE381, or, GR64, load, 8>;
   }
 
   // OR to memory
@@ -1229,8 +1229,8 @@
 
   // XORs of memory.
   let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
-    defm X  : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>;
-    def  XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>;
+    defm X  : BinaryRXPairAndPseudo<"x",0x57, 0xE357, xor, GR32, load, 4>;
+    defm XG : BinaryRXYAndPseudo<"xg", 0xE382, xor, GR64, load, 8>;
   }
 
   // XOR to memory
Index: lib/Target/SystemZ/SystemZPostRewrite.cpp
===================================================================
--- /dev/null
+++ lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -0,0 +1,124 @@
+//==---- SystemZPostRewrite.cpp - Select pseudos after RegAlloc ---*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that is run immediately after VirtRegRewriter
+// but before MachineCopyPropagation. The purpose is to lower pseudos to
+// target instructions before any later pass might substitute a register for
+// another.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+using namespace llvm;
+
+#define SYSTEMZ_POSTREWRITE_NAME "SystemZ Post Rewrite pass"
+
+#define DEBUG_TYPE "systemz-postrewrite"
+STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops.");
+
+namespace llvm {
+  void initializeSystemZPostRewritePass(PassRegistry&);
+}
+
+namespace {
+
+class SystemZPostRewrite : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZPostRewrite() : MachineFunctionPass(ID) {
+    initializeSystemZPostRewritePass(*PassRegistry::getPassRegistry());
+  }
+
+  const SystemZInstrInfo *TII;
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool selectMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                MachineBasicBlock::iterator &NextMBBI);
+  bool selectMBB(MachineBasicBlock &MBB);
+};
+
+char SystemZPostRewrite::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SystemZPostRewrite, "systemz-post-rewrite",
+                SYSTEMZ_POSTREWRITE_NAME, false, false)
+
+/// Returns an instance of the Post Rewrite pass.
+FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) {
+  return new SystemZPostRewrite();
+}
+
+/// If MBBI references a pseudo instruction that should be selected here,
+/// do it and return true.  Otherwise return false.
+bool SystemZPostRewrite::selectMI(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                MachineBasicBlock::iterator &NextMBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+
+  // Note: If this could be done during regalloc in foldMemoryOperandImpl()
+  // while also updating the LiveIntervals, there would be no need for the
+  // MemFoldPseudo to begin with.
+  int TargetMemOpcode = SystemZ::getTargetMemOpcode(Opcode);
+  if (TargetMemOpcode != -1) {
+    MI.setDesc(TII->get(TargetMemOpcode));
+    MI.tieOperands(0, 1);
+    unsigned DstReg = MI.getOperand(0).getReg();
+    MachineOperand &SrcMO = MI.getOperand(1);
+    if (DstReg != SrcMO.getReg()) {
+      BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), DstReg)
+        .addReg(SrcMO.getReg());
+      SrcMO.setReg(DstReg);
+      MemFoldCopies++;
+    }
+    return true;
+  }
+
+  return false;
+}
+
+/// Iterate over the instructions in basic block MBB and select any
+/// pseudo instructions.  Return true if anything was modified.
+bool SystemZPostRewrite::selectMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+    Modified |= selectMI(MBB, MBBI, NMBBI);
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+bool SystemZPostRewrite::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  bool Modified = false;
+  for (auto &MBB : MF)
+    Modified |= selectMBB(MBB);
+
+  return Modified;
+}
+
Index: lib/Target/SystemZ/SystemZRegisterInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -81,7 +81,8 @@
                                            const VirtRegMap *VRM,
                                            const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
-  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
 
   bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
       VirtReg, Order, Hints, MF, VRM, Matrix);
@@ -138,6 +139,51 @@
     }
   }
 
+  if (VRM == nullptr)
+    return BaseImplRetVal;
+
+  // Add any two address hints after any copy hints.
+  SmallSet<unsigned, 4> TwoAddrHints;
+  for (auto &Use : MRI->reg_nodbg_instructions(VirtReg))
+    if (SystemZ::getTwoOperandOpcode(Use.getOpcode()) != -1) {
+      const MachineOperand *VRRegMO = nullptr;
+      const MachineOperand *OtherMO = nullptr;
+      const MachineOperand *CommuMO = nullptr;
+      if (VirtReg == Use.getOperand(0).getReg()) {
+        VRRegMO = &Use.getOperand(0);
+        OtherMO = &Use.getOperand(1);
+        if (Use.isCommutable())
+          CommuMO = &Use.getOperand(2);
+      } else if (VirtReg == Use.getOperand(1).getReg()) {
+        VRRegMO = &Use.getOperand(1);
+        OtherMO = &Use.getOperand(0);
+      } else if (VirtReg == Use.getOperand(2).getReg() && Use.isCommutable()) {
+        VRRegMO = &Use.getOperand(2);
+        OtherMO = &Use.getOperand(0);
+      } else
+        continue;
+
+      auto tryAddHint = [&](const MachineOperand *MO) -> void {
+        unsigned Reg = MO->getReg();
+        unsigned PhysReg = isPhysicalRegister(Reg) ? Reg : VRM->getPhys(Reg);
+        if (PhysReg) {
+          if (MO->getSubReg())
+            PhysReg = getSubReg(PhysReg, MO->getSubReg());
+          if (VRRegMO->getSubReg())
+            PhysReg = getMatchingSuperReg(PhysReg, VRRegMO->getSubReg(),
+                                          MRI->getRegClass(VirtReg));
+          if (!MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg))
+            TwoAddrHints.insert(PhysReg);
+        }
+      };
+      tryAddHint(OtherMO);
+      if (CommuMO)
+        tryAddHint(CommuMO);
+    }
+  for (MCPhysReg OrderReg : Order)
+    if (TwoAddrHints.count(OrderReg))
+      Hints.push_back(OrderReg);
+
   return BaseImplRetVal;
 }
 
Index: lib/Target/SystemZ/SystemZShortenInst.cpp
===================================================================
--- lib/Target/SystemZ/SystemZShortenInst.cpp
+++ lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -299,6 +299,31 @@
     case SystemZ::VST64:
       Changed |= shortenOn0(MI, SystemZ::STD);
       break;
+
+    default: {
+      int TwoOperandOpcode = SystemZ::getTwoOperandOpcode(MI.getOpcode());
+      if (TwoOperandOpcode == -1)
+        break;
+
+      if ((MI.getOperand(0).getReg() != MI.getOperand(1).getReg()) &&
+          (!MI.isCommutable() ||
+           MI.getOperand(0).getReg() != MI.getOperand(2).getReg() ||
+           !TII->commuteInstruction(MI, false, 1, 2)))
+          break;
+
+      MI.setDesc(TII->get(TwoOperandOpcode));
+      MI.tieOperands(0, 1);
+      if (TwoOperandOpcode == SystemZ::SLL ||
+          TwoOperandOpcode == SystemZ::SLA ||
+          TwoOperandOpcode == SystemZ::SRL ||
+          TwoOperandOpcode == SystemZ::SRA) {
+        // These shifts only use the low 6 bits of the shift count.
+        MachineOperand &ImmMO = MI.getOperand(3);
+        ImmMO.setImm(ImmMO.getImm() & 0xfff);
+      }
+      Changed = true;
+      break;
+    }
     }
 
     LiveRegs.stepBackward(MI);
Index: lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -183,6 +183,7 @@
   void addIRPasses() override;
   bool addInstSelector() override;
   bool addILPOpts() override;
+  void addPostRewrite() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
 };
@@ -212,7 +213,16 @@
   return true;
 }
 
+void SystemZPassConfig::addPostRewrite() {
+  addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
+}
+
 void SystemZPassConfig::addPreSched2() {
+  // PostRewrite needs to be run at -O0 also (in which case addPostRewrite()
+  // is not called).
+  if (getOptLevel() == CodeGenOpt::None)
+    addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
+
   addPass(createSystemZExpandPseudoPass(getSystemZTargetMachine()));
 
   if (getOptLevel() != CodeGenOpt::None)
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -350,7 +350,8 @@
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
                         MachineBasicBlock::iterator InsertPt, int FrameIndex,
-                        LiveIntervals *LIS = nullptr) const override;
+                        LiveIntervals *LIS = nullptr,
+                        VirtRegMap *VRM = nullptr) const override;
 
   /// foldMemoryOperand - Same as the previous version except it allows folding
   /// of any load and store from / to any address, not just from a specific
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -4773,7 +4773,8 @@
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                                     ArrayRef<unsigned> Ops,
                                     MachineBasicBlock::iterator InsertPt,
-                                    int FrameIndex, LiveIntervals *LIS) const {
+                                    int FrameIndex, LiveIntervals *LIS,
+                                    VirtRegMap *VRM) const {
   // Check switch flag
   if (NoFusing)
     return nullptr;
Index: test/CodeGen/SystemZ/asm-18.ll
===================================================================
--- test/CodeGen/SystemZ/asm-18.ll
+++ test/CodeGen/SystemZ/asm-18.ll
@@ -603,13 +603,13 @@
 }
 
 ; Test three-operand halfword immediate addition involving mixtures of low
-; and high registers.  RISBHG/AIH would be OK too, instead of AHIK/RISBHG.
+; and high registers.  AHIK/RISBHG would be OK too, instead of RISBHG/AIH.
 define i32 @f28(i32 %old) {
 ; CHECK-LABEL: f28:
 ; CHECK: ahik [[REG1:%r[0-5]]], %r2, 14
 ; CHECK: stepa %r2, [[REG1]]
-; CHECK: ahik [[TMP:%r[0-5]]], [[REG1]], 254
-; CHECK: risbhg [[REG2:%r[0-5]]], [[TMP]], 0, 159, 32
+; CHECK: risbhg  [[REG1]], [[REG1]], 0, 159, 32
+; CHECK: aih     [[REG1]], 254
 ; CHECK: stepb [[REG1]], [[REG2]]
 ; CHECK: risbhg [[REG3:%r[0-5]]], [[REG2]], 0, 159, 0
 ; CHECK: aih [[REG3]], 127
Index: test/CodeGen/SystemZ/codegenprepare-splitstore.ll
===================================================================
--- test/CodeGen/SystemZ/codegenprepare-splitstore.ll
+++ test/CodeGen/SystemZ/codegenprepare-splitstore.ll
@@ -5,9 +5,9 @@
 define void @fun(i16* %Src, i16* %Dst) {
 ; CHECK-LABEL: # %bb.0:
 ; CHECK:       lh   %r0, 0(%r2)
+; CHECK-NEXT:  srlk    %r1, %r0, 8
 ; CHECK-NEXT:  stc  %r0, 1(%r3)
-; CHECK-NEXT:  srl  %r0, 8
-; CHECK-NEXT:  stc  %r0, 0(%r3)
+; CHECK-NEXT:  stc  %r1, 0(%r3)
 ; CHECK-NEXT:  br   %r14
   %1 = load i16, i16* %Src
   %2 = trunc i16 %1 to i8
Index: test/CodeGen/SystemZ/ctpop-01.ll
===================================================================
--- test/CodeGen/SystemZ/ctpop-01.ll
+++ test/CodeGen/SystemZ/ctpop-01.ll
@@ -9,10 +9,10 @@
 ; CHECK-LABEL: f1:
 ; CHECK: popcnt  %r0, %r2
 ; CHECK: sllk    %r1, %r0, 16
-; CHECK: ar      %r1, %r0
-; CHECK: sllk    %r2, %r1, 8
-; CHECK: ar      %r2, %r1
-; CHECK: srl     %r2, 24
+; CHECK: ar      %r0, %r1
+; CHECK: sllk    %r1, %r0, 8
+; CHECK: ar      %r0, %r1
+; CHECK: srlk    %r2, %r0, 24
 ; CHECK: br      %r14
 
   %popcnt = call i32 @llvm.ctpop.i32(i32 %a)
@@ -23,9 +23,9 @@
 ; CHECK-LABEL: f2:
 ; CHECK: llhr    %r0, %r2
 ; CHECK: popcnt  %r0, %r0
-; CHECK: risblg  %r2, %r0, 16, 151, 8
-; CHECK: ar      %r2, %r0
-; CHECK: srl     %r2, 8
+; CHECK: risblg  %r1, %r0, 16, 151, 8
+; CHECK: ar      %r0, %r1
+; CHECK: srlk    %r2, %r0, 8
 ; CHECK: br      %r14
   %and = and i32 %a, 65535
   %popcnt = call i32 @llvm.ctpop.i32(i32 %and)
@@ -46,12 +46,12 @@
 ; CHECK-LABEL: f4:
 ; CHECK: popcnt  %r0, %r2
 ; CHECK: sllg    %r1, %r0, 32
-; CHECK: agr     %r1, %r0
-; CHECK: sllg    %r0, %r1, 16
+; CHECK: agr     %r0, %r1
+; CHECK: sllg    %r1, %r0, 16
 ; CHECK: agr     %r0, %r1
 ; CHECK: sllg    %r1, %r0, 8
-; CHECK: agr     %r1, %r0
-; CHECK: srlg    %r2, %r1, 56
+; CHECK: agr     %r0, %r1
+; CHECK: srlg    %r2, %r0, 56
 ; CHECK: br      %r14
   %popcnt = call i64 @llvm.ctpop.i64(i64 %a)
   ret i64 %popcnt
@@ -76,8 +76,8 @@
 ; CHECK: llghr   %r0, %r2
 ; CHECK: popcnt  %r0, %r0
 ; CHECK: risbg   %r1, %r0, 48, 183, 8
-; CHECK: agr     %r1, %r0
-; CHECK: srlg    %r2, %r1, 8
+; CHECK: agr     %r0, %r1
+; CHECK: srlg    %r2, %r0, 8
 ; CHECK: br      %r14
   %and = and i64 %a, 65535
   %popcnt = call i64 @llvm.ctpop.i64(i64 %and)
Index: test/CodeGen/SystemZ/int-add-05.ll
===================================================================
--- test/CodeGen/SystemZ/int-add-05.ll
+++ test/CodeGen/SystemZ/int-add-05.ll
@@ -1,7 +1,7 @@
 ; Test 64-bit addition in which the second operand is variable.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s --check-prefixes=CHECK,Z10
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s --check-prefixes=CHECK,Z196
 
 declare i64 @foo()
 
@@ -97,10 +97,12 @@
 }
 
 ; Check that additions of spilled values can use AG rather than AGR.
+; Note: Z196 is suboptimal with one unfolded reload.
 define i64 @f9(i64 *%ptr0) {
 ; CHECK-LABEL: f9:
 ; CHECK: brasl %r14, foo@PLT
-; CHECK: ag %r2, 160(%r15)
+; Z10:  ag %r2, 168(%r15)
+; Z196: ag %r0, 168(%r15)
 ; CHECK: br %r14
   %ptr1 = getelementptr i64, i64 *%ptr0, i64 2
   %ptr2 = getelementptr i64, i64 *%ptr0, i64 4
Index: test/CodeGen/SystemZ/int-sub-11.ll
===================================================================
--- /dev/null
+++ test/CodeGen/SystemZ/int-sub-11.ll
@@ -0,0 +1,22 @@
+; Test of subtraction that involves a constant as the first operand
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check highest 16-bit signed int immediate value.
+define i64 @f1(i64 %a) {
+; CHECK-LABEL: f1:
+; CHECK: lghi %r0, 32767
+; CHECK: sgrk %r2, %r0, %r2
+; CHECK: br %r14
+  %sub = sub i64 32767, %a
+  ret i64 %sub
+}
+; Check highest 32-bit signed int immediate value.
+define i64 @f2(i64 %a) {
+; CHECK-LABEL: f2:
+; CHECK: lgfi %r0, 2147483647
+; CHECK: sgrk %r2, %r0, %r2
+; CHECK: br %r14
+  %sub = sub i64 2147483647, %a
+  ret i64 %sub
+}
Index: test/CodeGen/SystemZ/scalar-ctlz.ll
===================================================================
--- test/CodeGen/SystemZ/scalar-ctlz.ll
+++ test/CodeGen/SystemZ/scalar-ctlz.ll
@@ -55,10 +55,9 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llghr %r0, %r2
-; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi  %r2, -32
-; CHECK-NEXT: ahi   %r2, -16
-; CHECK-NEXT: # kill
+; CHECK-NEXT: flogr %r0, %r0
+; CHECK-NEXT: aghi  %r0, -32
+; CHECK-NEXT: ahik  %r2, %r0, -16
 ; CHECK-NEXT: br %r14
   %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 false)
   ret i16 %1
@@ -69,10 +68,9 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llghr %r0, %r2
-; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi  %r2, -32
-; CHECK-NEXT: ahi   %r2, -16
-; CHECK-NEXT: # kill
+; CHECK-NEXT: flogr %r0, %r0
+; CHECK-NEXT: aghi  %r0, -32
+; CHECK-NEXT: ahik  %r2, %r0, -16
 ; CHECK-NEXT: br %r14
   %1 = tail call i16 @llvm.ctlz.i16(i16 %arg, i1 true)
   ret i16 %1
@@ -83,10 +81,9 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llgcr %r0, %r2
-; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi  %r2, -32
-; CHECK-NEXT: ahi   %r2, -24
-; CHECK-NEXT: # kill
+; CHECK-NEXT: flogr %r0, %r0
+; CHECK-NEXT: aghi  %r0, -32
+; CHECK-NEXT: ahik  %r2, %r0, -24
 ; CHECK-NEXT: br %r14
   %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 false)
   ret i8 %1
@@ -97,10 +94,9 @@
 ; CHECK-LABEL: %bb.0:
 ; CHECK-NEXT: # kill
 ; CHECK-NEXT: llgcr %r0, %r2
-; CHECK-NEXT: flogr %r2, %r0
-; CHECK-NEXT: aghi  %r2, -32
-; CHECK-NEXT: ahi   %r2, -24
-; CHECK-NEXT: # kill
+; CHECK-NEXT: flogr %r0, %r0
+; CHECK-NEXT: aghi  %r0, -32
+; CHECK-NEXT: ahik  %r2, %r0, -24
 ; CHECK-NEXT: br %r14
   %1 = tail call i8 @llvm.ctlz.i8(i8 %arg, i1 true)
   ret i8 %1
Index: test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
===================================================================
--- test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
+++ test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -75,17 +75,17 @@
 ; CHECK-NEXT:    stmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    .cfi_offset %r14, -48
 ; CHECK-NEXT:    .cfi_offset %r15, -40
-; CHECK-NEXT:    vlgvf %r3, %v26, 1
-; CHECK-NEXT:    vlgvf %r1, %v26, 2
-; CHECK-NEXT:    risbgn %r4, %r3, 0, 129, 62
-; CHECK-NEXT:    rosbg %r4, %r1, 2, 32, 31
+; CHECK-DAG:     vlgvf [[REG11:%r[0-9]+]], %v26, 1
+; CHECK-DAG:     vlgvf [[REG12:%r[0-9]+]], %v26, 2
+; CHECK-DAG:     risbgn [[REG13:%r[0-9]+]], [[REG11]], 0, 129, 62
+; CHECK-DAG:     rosbg [[REG13]], [[REG12]], 2, 32, 31
 ; CHECK-DAG:     vlgvf %r0, %v26, 3
-; CHECK-DAG:     rosbg %r4, %r0, 33, 63, 0
+; CHECK-DAG:     rosbg [[REG13]], %r0, 33, 63, 0
 ; CHECK-DAG:     stc %r0, 30(%r2)
-; CHECK-DAG:     srl %r0, 8
+; CHECK-DAG:     srlk %r1, %r0, 8
 ; CHECK-DAG:     vlgvf [[REG0:%r[0-9]+]], %v24, 1
 ; CHECK-DAG:     vlgvf [[REG1:%r[0-9]+]], %v24, 0
-; CHECK-DAG:     sth %r0, 28(%r2)
+; CHECK-DAG:     sth %r1, 28(%r2)
 ; CHECK-DAG:     vlgvf [[REG2:%r[0-9]+]], %v24, 2
 ; CHECK-DAG:     risbgn [[REG3:%r[0-9]+]], [[REG0]], 0, 133, 58
 ; CHECK-DAG:     rosbg [[REG3]], [[REG2]], 6, 36, 27
@@ -95,18 +95,18 @@
 ; CHECK-DAG:     rosbg [[REG3]], [[REG5]], 37, 63, 60
 ; CHECK-DAG:     sllg [[REG6:%r[0-9]+]], [[REG4]], 8
 ; CHECK-DAG:     rosbg [[REG6]], [[REG3]], 56, 63, 8
-; CHECK-NEXT:    stg [[REG6]], 0(%r2)
-; CHECK-NEXT:    srlg [[REG7:%r[0-9]+]], %r4, 24
-; CHECK-NEXT:    st [[REG7]], 24(%r2)
-; CHECK-NEXT:    vlgvf [[REG8:%r[0-9]+]], %v26, 0
-; CHECK-NEXT:    risbgn [[REG10:%r[0-9]+]], [[REG5]], 0, 131, 60
-; CHECK-NEXT:    rosbg [[REG10]], [[REG8]], 4, 34, 29
-; CHECK-NEXT:    sllg [[REG9:%r[0-9]+]], [[REG3]], 8
-; CHECK-NEXT:    rosbg [[REG10]], %r3, 35, 63, 62
-; CHECK-NEXT:    rosbg [[REG9]], [[REG10]], 56, 63, 8
-; CHECK-NEXT:    stg [[REG9]], 8(%r2)
-; CHECK-NEXT:    sllg %r0, [[REG10]], 8
-; CHECK-NEXT:    rosbg %r0, %r4, 56, 63, 8
+; CHECK-DAG:     stg [[REG6]], 0(%r2)
+; CHECK-DAG:     srlg [[REG7:%r[0-9]+]], [[REG13]], 24
+; CHECK-DAG:     st [[REG7]], 24(%r2)
+; CHECK-DAG:     vlgvf [[REG8:%r[0-9]+]], %v26, 0
+; CHECK-DAG:     risbgn [[REG10:%r[0-9]+]], [[REG5]], 0, 131, 60
+; CHECK-DAG:     rosbg [[REG10]], [[REG8]], 4, 34, 29
+; CHECK-DAG:     sllg [[REG9:%r[0-9]+]], [[REG3]], 8
+; CHECK-DAG:     rosbg [[REG10]], [[REG11]], 35, 63, 62
+; CHECK-DAG:     rosbg [[REG9]], [[REG10]], 56, 63, 8
+; CHECK-DAG:     stg [[REG9]], 8(%r2)
+; CHECK-DAG:     sllg %r0, [[REG10]], 8
+; CHECK-DAG:     rosbg %r0, [[REG13]], 56, 63, 8
 ; CHECK-NEXT:    stg %r0, 16(%r2)
 ; CHECK-NEXT:    lmg %r14, %r15, 112(%r15)
 ; CHECK-NEXT:    br %r14
Index: test/CodeGen/SystemZ/vec-combine-02.ll
===================================================================
--- test/CodeGen/SystemZ/vec-combine-02.ll
+++ test/CodeGen/SystemZ/vec-combine-02.ll
@@ -408,7 +408,7 @@
 ; CHECK-NOT: vmrh
 ; CHECK: ar {{%r[0-5]}},
 ; CHECK: ar {{%r[0-5]}},
-; CHECK: or %r2,
+; CHECK: ork %r2,
 ; CHECK: br %r14
   %vec0 = insertelement <2 x double> undef, double %scalar0, i32 0
   %vec1 = insertelement <2 x double> undef, double %scalar1, i32 0