Index: llvm/include/llvm/CodeGen/MachineInstrBuilder.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineInstrBuilder.h
+++ llvm/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -195,7 +195,7 @@
   }
 
   const MachineInstrBuilder &addRegMask(const uint32_t *Mask) const {
-    MI->addOperand(*MF, MachineOperand::CreateRegMask(Mask));
+    MI->addOperand(*MF, MachineOperand::CreateRegMask(Mask, MF));
     return *this;
   }
 
Index: llvm/include/llvm/CodeGen/MachineOperand.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineOperand.h
+++ llvm/include/llvm/CodeGen/MachineOperand.h
@@ -27,6 +27,7 @@
 class ConstantInt;
 class GlobalValue;
 class MachineBasicBlock;
+class MachineFunction;
 class MachineInstr;
 class MachineRegisterInfo;
 class MCCFIInstruction;
@@ -704,10 +705,7 @@
   /// operand does not take ownership of the memory referenced by Mask, it must
   /// remain valid for the lifetime of the operand. See CreateRegMask().
   /// Any physreg with a 0 bit in the mask is clobbered by the instruction.
-  void setRegMask(const uint32_t *RegMaskPtr) {
-    assert(isRegMask() && "Wrong MachineOperand mutator");
-    Contents.RegMask = RegMaskPtr;
-  }
+  void setRegMask(const uint32_t *RegMaskPtr, MachineInstr *MI = nullptr);
 
   void setIntrinsicID(Intrinsic::ID IID) {
     assert(isIntrinsicID() && "Wrong MachineOperand mutator");
@@ -892,12 +890,9 @@
   ///
   /// Any physreg with a 0 bit in the mask is clobbered by the instruction.
   ///
-  static MachineOperand CreateRegMask(const uint32_t *Mask) {
-    assert(Mask && "Missing register mask");
-    MachineOperand Op(MachineOperand::MO_RegisterMask);
-    Op.Contents.RegMask = Mask;
-    return Op;
-  }
+  static MachineOperand CreateRegMask(const uint32_t *Mask,
+                                      MachineFunction *MF);
+
   static MachineOperand CreateRegLiveOut(const uint32_t *Mask) {
     assert(Mask && "Missing live-out register mask");
     MachineOperand Op(MachineOperand::MO_RegisterLiveOut);
Index: llvm/include/llvm/CodeGen/MachineRegisterInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -882,6 +882,8 @@
     UsedPhysRegMask.setBitsNotInMask(RegMask);
   }
 
+  void recollectUsedPhysRegMask();
+
   const BitVector &getUsedPhysRegsMask() const { return UsedPhysRegMask; }
 
   //===--------------------------------------------------------------------===//
Index: llvm/lib/CodeGen/MIRParser/MIParser.cpp
===================================================================
--- llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -2757,7 +2757,7 @@
 
   if (expectAndConsume(MIToken::rparen))
     return true;
-  Dest = MachineOperand::CreateRegMask(Mask);
+  Dest = MachineOperand::CreateRegMask(Mask, &MF);
   return false;
 }
 
@@ -2870,7 +2870,7 @@
     return true;
   case MIToken::Identifier:
     if (const auto *RegMask = PFS.Target.getRegMask(Token.stringValue())) {
-      Dest = MachineOperand::CreateRegMask(RegMask);
+      Dest = MachineOperand::CreateRegMask(RegMask, &MF);
       lex();
       break;
     } else if (Token.stringValue() == "CustomRegMask") {
Index: llvm/lib/CodeGen/MachineOperand.cpp
===================================================================
--- llvm/lib/CodeGen/MachineOperand.cpp
+++ llvm/lib/CodeGen/MachineOperand.cpp
@@ -150,6 +150,23 @@
     MF->getRegInfo().removeRegOperandFromUseList(this);
 }
 
+void MachineOperand::setRegMask(const uint32_t *RegMaskPtr, MachineInstr *MI) {
+  assert(isRegMask() && "Wrong MachineOperand mutator");
+  Contents.RegMask = RegMaskPtr;
+  if (MI == nullptr)
+    MI = ParentMI;
+  MI->getMF()->getRegInfo().addPhysRegsUsedFromRegMask(RegMaskPtr);
+}
+
+MachineOperand MachineOperand::CreateRegMask(const uint32_t *Mask,
+                                             MachineFunction *MF) {
+  assert(Mask && "Missing register mask");
+  MachineOperand Op(MachineOperand::MO_RegisterMask);
+  Op.Contents.RegMask = Mask;
+  MF->getRegInfo().addPhysRegsUsedFromRegMask(Mask);
+  return Op;
+}
+
 /// ChangeToImmediate - Replace this operand with a new immediate operand of
 /// the specified value.  If an operand is known to be an immediate already,
 /// the setImm method should be used.
Index: llvm/lib/CodeGen/MachineRegisterInfo.cpp
===================================================================
--- llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -523,8 +523,9 @@
   // used later.
   for (MCRegAliasIterator AI(PhysReg, TRI, true);
        AI.isValid(); ++AI)
-    if (!def_empty(*AI) || isAllocatable(*AI))
+    if (!def_empty(*AI) || isAllocatable(*AI) || UsedPhysRegMask.test(PhysReg))
       return false;
+
   return true;
 }
 
@@ -584,6 +585,18 @@
   return false;
 }
 
+void MachineRegisterInfo::recollectUsedPhysRegMask() {
+  UsedPhysRegMask.reset();
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB.instrs()) {
+      for (MachineOperand &MO : MI.operands()) {
+        if (MO.isRegMask())
+          addPhysRegsUsedFromRegMask(MO.getRegMask());
+      }
+    }
+  }
+}
+
 bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg,
                                         bool SkipRegMaskTest) const {
   if (!SkipRegMaskTest && UsedPhysRegMask.test(PhysReg))
Index: llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
===================================================================
--- llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
+++ llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -143,6 +143,10 @@
     }
   }
 
+  // MRI.UsedPhysRegMask should reflect the updated RegMask.
+  if (Changed)
+    MF.getRegInfo().recollectUsedPhysRegMask();
+
   LLVM_DEBUG(
       dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                 "++++++ \n");
Index: llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -863,7 +863,7 @@
 
   // Push the register mask info.
   Ops.push_back(MachineOperand::CreateRegMask(
-      TRI.getCallPreservedMask(*FuncInfo.MF, CC)));
+      TRI.getCallPreservedMask(*FuncInfo.MF, CC), FuncInfo.MF));
 
   // Add scratch registers as implicit def and early clobber.
   const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
Index: llvm/lib/CodeGen/TargetInstrInfo.cpp
===================================================================
--- llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -970,7 +970,7 @@
         // If the physreg has no defs anywhere, it's just an ambient register
         // and we can freely move its uses. Alternatively, if it's allocatable,
         // it could get allocated to something with a def during allocation.
-        if (!MRI.isConstantPhysReg(Reg))
+        if (!MRI.isConstantPhysReg(Reg) && !isIgnorableUse(MO))
           return false;
       } else {
         // A physreg def. We can't remat it.
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -577,6 +577,13 @@
   Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
                                                  Register Reg) const override;
 
+  /// Given \p MO is a PhysReg use return if it can be ignored for the purpose
+  /// of instruction rematerialization or sinking.
+  bool isIgnorableUse(const MachineOperand &MO) const override {
+    // An RIP relative address is a constant.
+    return MO.getReg() == X86::RIP;
+  }
+
 protected:
   /// Commutes the operands in the given instruction by changing the operands
   /// order and/or changing the instruction's opcode and/or the immediate value
Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -173,18 +173,24 @@
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s32, 0
+; GCN-NEXT:    s_mov_b32 s18, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s12, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v4, v0
 ; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT:    s_mov_b32 s19, s18
+; GCN-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; GCN-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_cbranch_vccnz .LBB4_2
 ; GCN-NEXT:  ; %bb.1: ; %if.else
 ; GCN-NEXT:    s_add_u32 s8, s8, 8
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v3
 ; GCN-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GCN-NEXT:    v_or3_b32 v31, v4, v1, v0
 ; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s14, s16
@@ -192,13 +198,7 @@
 ; GCN-NEXT:    s_add_u32 s18, s18, func_v3i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s19, s19, func_v3i16@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT:    s_branch .LBB4_3
-; GCN-NEXT:  .LBB4_2:
-; GCN-NEXT:    s_mov_b32 s4, 0
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:  .LBB4_3: ; %if.end
+; GCN-NEXT:  .LBB4_2: ; %if.end
 ; GCN-NEXT:    global_store_short v[0:1], v1, off
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_endpgm
@@ -226,18 +226,24 @@
 ; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GCN-NEXT:    s_add_u32 s0, s0, s17
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b32 s32, 0
+; GCN-NEXT:    s_mov_b32 s18, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s12, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-NEXT:    v_mov_b32_e32 v4, v0
 ; GCN-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT:    s_mov_b32 s19, s18
+; GCN-NEXT:    v_mov_b32_e32 v0, s18
 ; GCN-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; GCN-NEXT:    v_mov_b32_e32 v1, s19
+; GCN-NEXT:    s_mov_b32 s32, 0
 ; GCN-NEXT:    s_cbranch_vccnz .LBB5_2
 ; GCN-NEXT:  ; %bb.1: ; %if.else
 ; GCN-NEXT:    s_add_u32 s8, s8, 8
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 20, v2
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v3
 ; GCN-NEXT:    s_addc_u32 s9, s9, 0
-; GCN-NEXT:    v_or3_b32 v31, v0, v1, v2
+; GCN-NEXT:    v_or3_b32 v31, v4, v1, v0
 ; GCN-NEXT:    s_mov_b32 s12, s14
 ; GCN-NEXT:    s_mov_b32 s13, s15
 ; GCN-NEXT:    s_mov_b32 s14, s16
@@ -245,13 +251,7 @@
 ; GCN-NEXT:    s_add_u32 s18, s18, func_v3f16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s19, s19, func_v3f16@rel32@hi+12
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[18:19]
-; GCN-NEXT:    s_branch .LBB5_3
-; GCN-NEXT:  .LBB5_2:
-; GCN-NEXT:    s_mov_b32 s4, 0
-; GCN-NEXT:    s_mov_b32 s5, s4
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:  .LBB5_3: ; %if.end
+; GCN-NEXT:  .LBB5_2: ; %if.end
 ; GCN-NEXT:    global_store_short v[0:1], v1, off
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_endpgm
Index: llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -33,10 +33,10 @@
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[54:55], s[6:7]
 ; GLOBALNESS1-NEXT:    s_load_dwordx4 s[36:39], s[8:9], 0x0
 ; GLOBALNESS1-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, v0
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v44, 0
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v44, off
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dword v[46:47], v44, off
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, v0
 ; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS1-NEXT:    global_load_dword v0, v44, s[36:37]
 ; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -176,8 +176,7 @@
 ; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS1-NEXT:  .LBB1_9: ; %baz.exit.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS1-NEXT:    flat_load_dword v0, v[32:33]
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[46:47]
 ; GLOBALNESS1-NEXT:    s_mov_b32 s68, s93
 ; GLOBALNESS1-NEXT:    s_mov_b32 s70, s93
 ; GLOBALNESS1-NEXT:    s_mov_b32 s71, s69
@@ -217,7 +216,7 @@
 ; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS1-NEXT:  ; %bb.10: ; %bb33.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[32:33], off
+; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[46:47], off
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 0
 ; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 1
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -225,8 +224,7 @@
 ; GLOBALNESS1-NEXT:  ; %bb.11: ; %bb39.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
@@ -281,7 +279,6 @@
 ; GLOBALNESS1-NEXT:    s_mov_b32 s14, s98
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[66:67]
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[64:65]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
@@ -366,14 +363,12 @@
 ; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
-; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS1-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -418,10 +413,10 @@
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[6:7]
 ; GLOBALNESS0-NEXT:    s_load_dwordx4 s[36:39], s[8:9], 0x0
 ; GLOBALNESS0-NEXT:    s_load_dword s6, s[8:9], 0x14
-; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, v0
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v44, 0
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v44, off
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dword v[46:47], v44, off
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v42, v0
 ; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GLOBALNESS0-NEXT:    global_load_dword v0, v44, s[36:37]
 ; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
@@ -561,8 +556,7 @@
 ; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_24
 ; GLOBALNESS0-NEXT:  .LBB1_9: ; %baz.exit.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS0-NEXT:    flat_load_dword v0, v[32:33]
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[46:47]
 ; GLOBALNESS0-NEXT:    s_mov_b32 s68, s93
 ; GLOBALNESS0-NEXT:    s_mov_b32 s70, s93
 ; GLOBALNESS0-NEXT:    s_mov_b32 s71, s69
@@ -602,7 +596,7 @@
 ; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
 ; GLOBALNESS0-NEXT:  ; %bb.10: ; %bb33.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[32:33], off
+; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[46:47], off
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 0
 ; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 1
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
@@ -610,8 +604,7 @@
 ; GLOBALNESS0-NEXT:  ; %bb.11: ; %bb39.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:  .LBB1_12: ; %bb44.lr.ph.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
@@ -666,7 +659,6 @@
 ; GLOBALNESS0-NEXT:    s_mov_b32 s14, s98
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v42
 ; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[66:67]
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[46:47], 0, 0
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[62:63]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[54:55]
 ; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
@@ -751,14 +743,12 @@
 ; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_1
 ; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
 ; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
 ; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
-; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
-; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[46:47], v[44:45], off
 ; GLOBALNESS0-NEXT:    s_branch .LBB1_2
 ; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
 ; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
Index: llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -333,15 +333,15 @@
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s33, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v4, s33, 2
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0x400
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v4, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
 ; GFX9-O0-NEXT:    s_mov_b32 s37, s5
@@ -363,19 +363,19 @@
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[46:47]
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O0-NEXT:    v_add_u32_e64 v1, v1, v2
+; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O0-NEXT:    v_add_u32_e64 v2, v3, v2
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[40:41]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O0-NEXT:    buffer_store_dword v0, off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffffc00
-; GFX9-O0-NEXT:    v_readlane_b32 s33, v3, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s33, v4, 2
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -384,15 +384,15 @@
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s33, 2
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX9-O3-NEXT:    v_writelane_b32 v4, s33, 2
+; GFX9-O3-NEXT:    v_writelane_b32 v4, s30, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT:    v_writelane_b32 v3, s31, 1
+; GFX9-O3-NEXT:    v_writelane_b32 v4, s31, 1
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX9-O3-NEXT:    s_not_b64 exec, exec
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v2, 0
@@ -403,19 +403,19 @@
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
 ; GFX9-O3-NEXT:    s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-O3-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-O3-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-O3-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v3, 0
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v4, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v4, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT:    v_readlane_b32 s33, v3, 2
+; GFX9-O3-NEXT:    v_readlane_b32 s33, v4, 2
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
@@ -517,36 +517,36 @@
 ; GFX9-O0:       ; %bb.0:
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v13, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s33, 8
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s33, 8
 ; GFX9-O0-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xc00
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s30, 0
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s31, 1
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s30, 0
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s31, 1
 ; GFX9-O0-NEXT:    s_mov_b32 s34, s8
 ; GFX9-O0-NEXT:    s_mov_b32 s36, s4
 ; GFX9-O0-NEXT:    ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
 ; GFX9-O0-NEXT:    s_mov_b32 s37, s5
 ; GFX9-O0-NEXT:    s_mov_b32 s38, s6
 ; GFX9-O0-NEXT:    s_mov_b32 s39, s7
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s36, 2
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s37, 3
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s38, 4
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s39, 5
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s36, 2
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s37, 3
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s38, 4
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s39, 5
 ; GFX9-O0-NEXT:    ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
 ; GFX9-O0-NEXT:    s_mov_b32 s35, s9
 ; GFX9-O0-NEXT:    ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35
@@ -558,12 +558,12 @@
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v9, s37
 ; GFX9-O0-NEXT:    s_not_b64 exec, exec
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s34, 6
-; GFX9-O0-NEXT:    v_writelane_b32 v10, s35, 7
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s34, 6
+; GFX9-O0-NEXT:    v_writelane_b32 v13, s35, 7
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 32
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s34, v[8:9]
+; GFX9-O0-NEXT:    v_lshrrev_b64 v[11:12], s34, v[8:9]
 ; GFX9-O0-NEXT:    s_getpc_b64 s[34:35]
 ; GFX9-O0-NEXT:    s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
 ; GFX9-O0-NEXT:    s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
@@ -572,55 +572,53 @@
 ; GFX9-O0-NEXT:    s_mov_b64 s[36:37], s[0:1]
 ; GFX9-O0-NEXT:    s_mov_b64 s[0:1], s[36:37]
 ; GFX9-O0-NEXT:    s_mov_b64 s[2:3], s[38:39]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v11
 ; GFX9-O0-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O0-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-O0-NEXT:    v_readlane_b32 s34, v10, 6
-; GFX9-O0-NEXT:    v_readlane_b32 s35, v10, 7
-; GFX9-O0-NEXT:    v_readlane_b32 s36, v10, 2
-; GFX9-O0-NEXT:    v_readlane_b32 s37, v10, 3
-; GFX9-O0-NEXT:    v_readlane_b32 s38, v10, 4
-; GFX9-O0-NEXT:    v_readlane_b32 s39, v10, 5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT:    v_readlane_b32 s34, v13, 6
+; GFX9-O0-NEXT:    v_readlane_b32 s35, v13, 7
+; GFX9-O0-NEXT:    v_readlane_b32 s36, v13, 2
+; GFX9-O0-NEXT:    v_readlane_b32 s37, v13, 3
+; GFX9-O0-NEXT:    v_readlane_b32 s38, v13, 4
+; GFX9-O0-NEXT:    v_readlane_b32 s39, v13, 5
+; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v0
+; GFX9-O0-NEXT:    v_mov_b32_e32 v11, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
-; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
-; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
-; GFX9-O0-NEXT:    v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41]
+; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $exec
+; GFX9-O0-NEXT:    v_add_co_u32_e64 v8, s[40:41], v10, v8
+; GFX9-O0-NEXT:    v_addc_co_u32_e64 v9, s[40:41], v11, v9, s[40:41]
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v9
 ; GFX9-O0-NEXT:    s_mov_b32 s34, 0
 ; GFX9-O0-NEXT:    buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4
-; GFX9-O0-NEXT:    v_readlane_b32 s31, v10, 1
-; GFX9-O0-NEXT:    v_readlane_b32 s30, v10, 0
+; GFX9-O0-NEXT:    v_readlane_b32 s31, v13, 1
+; GFX9-O0-NEXT:    v_readlane_b32 s30, v13, 0
 ; GFX9-O0-NEXT:    s_add_i32 s32, s32, 0xfffff400
-; GFX9-O0-NEXT:    v_readlane_b32 s33, v10, 8
+; GFX9-O0-NEXT:    v_readlane_b32 s33, v13, 8
 ; GFX9-O0-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v13, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
 ; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
@@ -629,21 +627,18 @@
 ; GFX9-O3:       ; %bb.0:
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O3-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s33, 2
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s30, 0
+; GFX9-O3-NEXT:    v_writelane_b32 v10, s33, 2
+; GFX9-O3-NEXT:    v_writelane_b32 v10, s30, 0
 ; GFX9-O3-NEXT:    s_mov_b32 s33, s32
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0x800
-; GFX9-O3-NEXT:    v_writelane_b32 v8, s31, 1
+; GFX9-O3-NEXT:    v_writelane_b32 v10, s31, 1
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
 ; GFX9-O3-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-O3-NEXT:    s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4
@@ -661,31 +656,27 @@
 ; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-O3-NEXT:    s_swappc_b64 s[30:31], s[36:37]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-O3-NEXT:    v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX9-O3-NEXT:    v_mov_b32_e32 v8, v0
+; GFX9-O3-NEXT:    v_mov_b32_e32 v9, v1
+; GFX9-O3-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-O3-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[38:39]
-; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-O3-NEXT:    v_mov_b32_e32 v0, v6
+; GFX9-O3-NEXT:    v_mov_b32_e32 v1, v7
 ; GFX9-O3-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT:    v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT:    v_readlane_b32 s30, v8, 0
+; GFX9-O3-NEXT:    v_readlane_b32 s31, v10, 1
+; GFX9-O3-NEXT:    v_readlane_b32 s30, v10, 0
 ; GFX9-O3-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT:    v_readlane_b32 s33, v8, 2
+; GFX9-O3-NEXT:    v_readlane_b32 s33, v10, 2
 ; GFX9-O3-NEXT:    s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
 ; GFX9-O3-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_nop 0
-; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
-; GFX9-O3-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O3-NEXT:    s_nop 0
-; GFX9-O3-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O3-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
 ; GFX9-O3-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-O3-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O3-NEXT:    s_setpc_b64 s[30:31]
Index: llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -118,9 +118,9 @@
 ; GFX9: v_mov_b32_e32 v0, v2
 ; GFX9: s_swappc_b64
   %tmp134 = call i32 @called(i32 %tmp107)
-; GFX9: v_mov_b32_e32 v1, v0
-; GFX9-O3: v_add_u32_e32 v1, v1, v2
-; GFX9-O0: v_add_u32_e64 v1, v1, v2
+; GFX9: v_mov_b32_e32 v3, v0
+; GFX9-O3: v_add_u32_e32 v2, v3, v2
+; GFX9-O0: v_add_u32_e64 v2, v3, v2
   %tmp136 = add i32 %tmp134, %tmp107
   %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136)
 ; GFX9: buffer_store_dword v0
@@ -309,9 +309,9 @@
 ; GFX9: v_mov_b32_e32 v0, v2
 ; GFX9: s_swappc_b64
   %tmp134 = call i32 @strict_wwm_called(i32 %tmp107)
-; GFX9: v_mov_b32_e32 v1, v0
-; GFX9-O3: v_add_u32_e32 v1, v1, v2
-; GFX9-O0: v_add_u32_e64 v1, v1, v2
+; GFX9: v_mov_b32_e32 v3, v0
+; GFX9-O3: v_add_u32_e32 v2, v3, v2
+; GFX9-O0: v_add_u32_e64 v2, v3, v2
   %tmp136 = add i32 %tmp134, %tmp107
   %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
 ; GFX9: buffer_store_dword v0
Index: llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll
+++ llvm/test/CodeGen/PowerPC/cse-despite-rounding-mode.ll
@@ -2,18 +2,18 @@
 ; Without strictfp, CSE should be free to eliminate the repeated multiply
 ; and conversion instructions.
 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
-; RUN:   -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
+; RUN:   -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
 ; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
-; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
-; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 2
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvrdpic' | count 4
 
 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
-; RUN:   -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
+; RUN:   -mcpu=pwr8 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
 ; RUN: llc -verify-machineinstrs --mtriple powerpc-unknown-linux-gnu \
-; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
+; RUN:   -mcpu=pwr9 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
 ; RUN: llc -verify-machineinstrs --mtriple powerpc64le-unknown-linux-gnu \
-; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 2
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names < %s | grep 'xvmuldp' | count 4
 @IndirectCallPtr = dso_local local_unnamed_addr global ptr null, align 8
 
 define dso_local signext i32 @func1() local_unnamed_addr #0 {
Index: llvm/test/CodeGen/PowerPC/cxx_tlscc64.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/cxx_tlscc64.ll
+++ llvm/test/CodeGen/PowerPC/cxx_tlscc64.ll
@@ -19,10 +19,11 @@
 ; CHECK-NEXT:    std 0, 16(1)
 ; CHECK-NEXT:    stdu 1, -48(1)
 ; CHECK-NEXT:    addis 3, 13, __tls_guard@tprel@ha
-; CHECK-NEXT:    lbz 4, __tls_guard@tprel@l(3)
-; CHECK-NEXT:    andi. 4, 4, 1
+; CHECK-NEXT:    lbz 3, __tls_guard@tprel@l(3)
+; CHECK-NEXT:    andi. 3, 3, 1
 ; CHECK-NEXT:    bc 12, 1, .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %init.i
+; CHECK-NEXT:    addis 3, 13, __tls_guard@tprel@ha
 ; CHECK-NEXT:    li 4, 1
 ; CHECK-NEXT:    stb 4, __tls_guard@tprel@l(3)
 ; CHECK-NEXT:    addis 3, 13, sg@tprel@ha
Index: llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
+++ llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll
@@ -1231,10 +1231,10 @@
 ; P8-NEXT:    bl __gnu_h2f_ieee
 ; P8-NEXT:    nop
 ; P8-NEXT:    xxlxor f0, f0, f0
+; P8-NEXT:    addis r3, r2, .LCPI20_0@toc@ha
 ; P8-NEXT:    fcmpu cr0, f1, f0
 ; P8-NEXT:    beq cr0, .LBB20_2
 ; P8-NEXT:  # %bb.1:
-; P8-NEXT:    addis r3, r2, .LCPI20_0@toc@ha
 ; P8-NEXT:    lfs f0, .LCPI20_0@toc@l(r3)
 ; P8-NEXT:  .LBB20_2:
 ; P8-NEXT:    fmr f1, f0
Index: llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
===================================================================
--- llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
+++ llvm/test/CodeGen/SPARC/2011-01-19-DelaySlot.ll
@@ -154,7 +154,7 @@
 entry:
 ;CHECK-LABEL: restore_sethi:
 ;CHECK-NOT: sethi  3
-;CHECK: restore %g0, 3072, %o0
+;CHECK: restore
   %0 = tail call i32 @bar(i32 %a) nounwind
   %1 = icmp ne i32 %0, 0
   %2 = select i1 %1, i32 3072, i32 0
Index: llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
===================================================================
--- llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
+++ llvm/test/CodeGen/SPARC/smulo-128-legalisation-lowering.ll
@@ -15,7 +15,7 @@
 ; SPARC-NEXT:    mov %i3, %l7
 ; SPARC-NEXT:    mov %i2, %l5
 ; SPARC-NEXT:    mov %i1, %l4
-; SPARC-NEXT:    mov %i0, %l6
+; SPARC-NEXT:    mov %i0, %l0
 ; SPARC-NEXT:    sra %i0, 31, %o4
 ; SPARC-NEXT:    st %o4, [%sp+96]
 ; SPARC-NEXT:    st %o4, [%sp+92]
@@ -30,41 +30,41 @@
 ; SPARC-NEXT:    st %o2, [%fp+-20] ! 4-byte Folded Spill
 ; SPARC-NEXT:    st %o3, [%fp+-24] ! 4-byte Folded Spill
 ; SPARC-NEXT:    st %i5, [%sp+96]
-; SPARC-NEXT:    mov %g0, %l0
+; SPARC-NEXT:    mov %g0, %l6
 ; SPARC-NEXT:    st %i4, [%sp+92]
-; SPARC-NEXT:    mov %l0, %o0
-; SPARC-NEXT:    mov %l0, %o1
+; SPARC-NEXT:    mov %l6, %o0
+; SPARC-NEXT:    mov %l6, %o1
 ; SPARC-NEXT:    mov %i2, %o2
 ; SPARC-NEXT:    mov %i3, %o3
-; SPARC-NEXT:    mov %l0, %o4
+; SPARC-NEXT:    mov %l6, %o4
 ; SPARC-NEXT:    call __multi3
-; SPARC-NEXT:    mov %l0, %o5
+; SPARC-NEXT:    mov %l6, %o5
 ; SPARC-NEXT:    st %o0, [%fp+-28] ! 4-byte Folded Spill
 ; SPARC-NEXT:    st %o1, [%fp+-32] ! 4-byte Folded Spill
 ; SPARC-NEXT:    st %o2, [%fp+-36] ! 4-byte Folded Spill
 ; SPARC-NEXT:    mov %o3, %i3
 ; SPARC-NEXT:    st %l3, [%sp+96]
 ; SPARC-NEXT:    st %l2, [%sp+92]
-; SPARC-NEXT:    mov %l0, %o0
-; SPARC-NEXT:    mov %l0, %o1
+; SPARC-NEXT:    mov %l6, %o0
+; SPARC-NEXT:    mov %l6, %o1
 ; SPARC-NEXT:    mov %i2, %o2
 ; SPARC-NEXT:    mov %l7, %o3
-; SPARC-NEXT:    mov %l0, %o4
+; SPARC-NEXT:    mov %l6, %o4
 ; SPARC-NEXT:    call __multi3
-; SPARC-NEXT:    mov %l0, %o5
+; SPARC-NEXT:    mov %l6, %o5
 ; SPARC-NEXT:    mov %o0, %i0
 ; SPARC-NEXT:    mov %o1, %i1
 ; SPARC-NEXT:    st %o2, [%fp+-4] ! 4-byte Folded Spill
 ; SPARC-NEXT:    st %o3, [%fp+-8] ! 4-byte Folded Spill
 ; SPARC-NEXT:    st %l3, [%sp+96]
 ; SPARC-NEXT:    st %l2, [%sp+92]
-; SPARC-NEXT:    mov %l0, %o0
-; SPARC-NEXT:    mov %l0, %o1
-; SPARC-NEXT:    mov %l6, %o2
+; SPARC-NEXT:    mov %l6, %o0
+; SPARC-NEXT:    mov %l6, %o1
+; SPARC-NEXT:    mov %l0, %o2
 ; SPARC-NEXT:    mov %l4, %o3
-; SPARC-NEXT:    mov %l0, %o4
+; SPARC-NEXT:    mov %l6, %o4
 ; SPARC-NEXT:    call __multi3
-; SPARC-NEXT:    mov %l0, %o5
+; SPARC-NEXT:    mov %l6, %o5
 ; SPARC-NEXT:    mov %o0, %l2
 ; SPARC-NEXT:    mov %o1, %l3
 ; SPARC-NEXT:    mov %o2, %l1
@@ -75,7 +75,7 @@
 ; SPARC-NEXT:    mov %o0, %o1
 ; SPARC-NEXT:    mov %o0, %o2
 ; SPARC-NEXT:    mov %o0, %o3
-; SPARC-NEXT:    mov %l6, %o4
+; SPARC-NEXT:    mov %l0, %o4
 ; SPARC-NEXT:    call __multi3
 ; SPARC-NEXT:    mov %l4, %o5
 ; SPARC-NEXT:    st %i5, [%sp+96]
@@ -101,15 +101,15 @@
 ; SPARC-NEXT:    addxcc %g3, 0, %g3
 ; SPARC-NEXT:    addcc %i2, %i3, %i2
 ; SPARC-NEXT:    addxcc %g2, %g3, %i3
-; SPARC-NEXT:    addxcc %l0, 0, %l1
-; SPARC-NEXT:    addxcc %l0, 0, %l2
-; SPARC-NEXT:    mov %l0, %o0
-; SPARC-NEXT:    mov %l0, %o1
-; SPARC-NEXT:    mov %l6, %o2
+; SPARC-NEXT:    addxcc %l6, 0, %l1
+; SPARC-NEXT:    addxcc %l6, 0, %l2
+; SPARC-NEXT:    mov %l6, %o0
+; SPARC-NEXT:    mov %l6, %o1
+; SPARC-NEXT:    mov %l0, %o2
 ; SPARC-NEXT:    mov %l4, %o3
-; SPARC-NEXT:    mov %l0, %o4
+; SPARC-NEXT:    mov %l6, %o4
 ; SPARC-NEXT:    call __multi3
-; SPARC-NEXT:    mov %l0, %o5
+; SPARC-NEXT:    mov %l6, %o5
 ; SPARC-NEXT:    addcc %o3, %i2, %i2
 ; SPARC-NEXT:    addxcc %o2, %i3, %i3
 ; SPARC-NEXT:    addxcc %o1, %l1, %g2
@@ -127,15 +127,15 @@
 ; SPARC-NEXT:    or %i2, %i4, %i2
 ; SPARC-NEXT:    or %i2, %i3, %i2
 ; SPARC-NEXT:    cmp %i2, 0
-; SPARC-NEXT:    be .LBB0_2
-; SPARC-NEXT:    nop
-; SPARC-NEXT:  ! %bb.1:
-; SPARC-NEXT:    mov 1, %l0
+; SPARC-NEXT:    bne .LBB0_2
+; SPARC-NEXT:    mov 1, %i4
+; SPARC-NEXT:  ! %bb.1: ! %start
+; SPARC-NEXT:    mov %l6, %i4
 ; SPARC-NEXT:  .LBB0_2: ! %start
 ; SPARC-NEXT:    ld [%fp+-4], %i2 ! 4-byte Folded Reload
 ; SPARC-NEXT:    ld [%fp+-8], %i3 ! 4-byte Folded Reload
 ; SPARC-NEXT:    ret
-; SPARC-NEXT:    restore %g0, %l0, %o4
+; SPARC-NEXT:    restore
 ;
 ; SPARC64-LABEL: muloti_test:
 ; SPARC64:         .cfi_startproc
Index: llvm/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
===================================================================
--- llvm/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
+++ llvm/test/CodeGen/X86/2008-10-27-CoalescerBug.ll
@@ -3,6 +3,12 @@
 ; Now this test spills one register. But a reload in the loop is cheaper than
 ; the divsd so it's a win.
 
+; FIXME: MachineLICM failed to move DIVSDrr out of loop because it uses register
+; $mxcsr, it is clobbered by function call to sin. We need to model the
+; volatile / non-volatile part of $mxcsr, so DIVSDrr uses the non-volatile part
+; of $mxcsr and function call clobbers volatile part of $mxcsr, then we can
+; safely move DIVSDrr out of the loop.
+
 define fastcc void @fourn(ptr %data, i32 %isign) nounwind {
 ; CHECK: fourn
 entry:
@@ -15,10 +21,9 @@
 	%1 = icmp sgt i32 %0, 2		; <i1> [#uses=1]
 	br i1 %1, label %bb30.loopexit, label %bb
 
-; CHECK: %bb30.loopexit
+; CHECK: %bb18
 ; CHECK: divsd %xmm0
-; CHECK: movsd %xmm0, 16(%esp)
-; CHECK: %bb3
+; CHECK: movsd %xmm0, (%esp)
 bb3:		; preds = %bb30.loopexit, %bb25, %bb3
 	%2 = load i32, ptr null, align 4		; <i32> [#uses=1]
 	%3 = mul i32 %2, 0		; <i32> [#uses=1]
Index: llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
===================================================================
--- llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
+++ llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll
@@ -380,10 +380,12 @@
 ; SSE2-NEXT:    orq %rax, %rcx
 ; SSE2-NEXT:    testq %rdi, %rdi
 ; SSE2-NEXT:    cmovnsq %rdi, %rcx
-; SSE2-NEXT:    cvtsi2ss %rcx, %xmm0
-; SSE2-NEXT:    jns .LBB9_2
+; SSE2-NEXT:    cvtsi2ss %rcx, %xmm1
+; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    addss %xmm1, %xmm0
+; SSE2-NEXT:    js .LBB9_2
 ; SSE2-NEXT:  # %bb.1:
-; SSE2-NEXT:    addss %xmm0, %xmm0
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:  .LBB9_2:
 ; SSE2-NEXT:    pushq %rax
 ; SSE2-NEXT:    callq __truncsfhf2@PLT
Index: llvm/test/CodeGen/X86/half.ll
===================================================================
--- llvm/test/CodeGen/X86/half.ll
+++ llvm/test/CodeGen/X86/half.ll
@@ -367,19 +367,19 @@
 ; CHECK-LIBCALL:       # %bb.0:
 ; CHECK-LIBCALL-NEXT:    pushq %rbx
 ; CHECK-LIBCALL-NEXT:    movq %rsi, %rbx
-; CHECK-LIBCALL-NEXT:    testq %rdi, %rdi
-; CHECK-LIBCALL-NEXT:    js .LBB10_1
-; CHECK-LIBCALL-NEXT:  # %bb.2:
-; CHECK-LIBCALL-NEXT:    cvtsi2ss %rdi, %xmm0
-; CHECK-LIBCALL-NEXT:    jmp .LBB10_3
-; CHECK-LIBCALL-NEXT:  .LBB10_1:
 ; CHECK-LIBCALL-NEXT:    movq %rdi, %rax
 ; CHECK-LIBCALL-NEXT:    shrq %rax
-; CHECK-LIBCALL-NEXT:    andl $1, %edi
-; CHECK-LIBCALL-NEXT:    orq %rax, %rdi
-; CHECK-LIBCALL-NEXT:    cvtsi2ss %rdi, %xmm0
+; CHECK-LIBCALL-NEXT:    movl %edi, %ecx
+; CHECK-LIBCALL-NEXT:    andl $1, %ecx
+; CHECK-LIBCALL-NEXT:    orq %rax, %rcx
+; CHECK-LIBCALL-NEXT:    cvtsi2ss %rcx, %xmm0
 ; CHECK-LIBCALL-NEXT:    addss %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT:  .LBB10_3:
+; CHECK-LIBCALL-NEXT:    cvtsi2ss %rdi, %xmm1
+; CHECK-LIBCALL-NEXT:    testq %rdi, %rdi
+; CHECK-LIBCALL-NEXT:    js .LBB10_2
+; CHECK-LIBCALL-NEXT:  # %bb.1:
+; CHECK-LIBCALL-NEXT:    movaps %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT:  .LBB10_2:
 ; CHECK-LIBCALL-NEXT:    callq __truncsfhf2@PLT
 ; CHECK-LIBCALL-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-LIBCALL-NEXT:    movw %ax, (%rbx)
Index: llvm/test/CodeGen/X86/pr29112.ll
===================================================================
--- llvm/test/CodeGen/X86/pr29112.ll
+++ llvm/test/CodeGen/X86/pr29112.ll
@@ -39,17 +39,17 @@
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm11 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT:    vaddps %xmm8, %xmm11, %xmm8
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3,3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm2
 ; CHECK-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT:    vaddps %xmm1, %xmm12, %xmm9
-; CHECK-NEXT:    vaddps %xmm1, %xmm1, %xmm3
+; CHECK-NEXT:    vaddps %xmm8, %xmm11, %xmm3
+; CHECK-NEXT:    vaddps %xmm1, %xmm1, %xmm8
 ; CHECK-NEXT:    vaddps %xmm0, %xmm10, %xmm0
-; CHECK-NEXT:    vaddps %xmm0, %xmm8, %xmm0
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vmovaps %xmm3, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovaps %xmm8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps %xmm9, (%rsp)
 ; CHECK-NEXT:    vmovaps %xmm13, %xmm3
 ; CHECK-NEXT:    vzeroupper
Index: llvm/test/CodeGen/X86/sqrt-partial.ll
===================================================================
--- llvm/test/CodeGen/X86/sqrt-partial.ll
+++ llvm/test/CodeGen/X86/sqrt-partial.ll
@@ -12,22 +12,24 @@
 define float @f(float %val) nounwind {
 ; SSE-LABEL: f:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
+; SSE-NEXT:    sqrtss %xmm0, %xmm1
+; SSE-NEXT:    xorps %xmm2, %xmm2
+; SSE-NEXT:    ucomiss %xmm2, %xmm0
 ; SSE-NEXT:    jb .LBB0_2
 ; SSE-NEXT:  # %bb.1: # %.split
-; SSE-NEXT:    sqrtss %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ; SSE-NEXT:  .LBB0_2: # %call.sqrt
 ; SSE-NEXT:    jmp sqrtf # TAILCALL
 ;
 ; AVX-LABEL: f:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vucomiss %xmm1, %xmm0
+; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vucomiss %xmm2, %xmm0
 ; AVX-NEXT:    jb .LBB0_2
 ; AVX-NEXT:  # %bb.1: # %.split
-; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovaps %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  .LBB0_2: # %call.sqrt
 ; AVX-NEXT:    jmp sqrtf # TAILCALL
@@ -38,22 +40,24 @@
 define double @d(double %val) nounwind {
 ; SSE-LABEL: d:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    xorpd %xmm1, %xmm1
-; SSE-NEXT:    ucomisd %xmm1, %xmm0
+; SSE-NEXT:    sqrtsd %xmm0, %xmm1
+; SSE-NEXT:    xorpd %xmm2, %xmm2
+; SSE-NEXT:    ucomisd %xmm2, %xmm0
 ; SSE-NEXT:    jb .LBB1_2
 ; SSE-NEXT:  # %bb.1: # %.split
-; SSE-NEXT:    sqrtsd %xmm0, %xmm0
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ; SSE-NEXT:  .LBB1_2: # %call.sqrt
 ; SSE-NEXT:    jmp sqrt # TAILCALL
 ;
 ; AVX-LABEL: d:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vucomisd %xmm1, %xmm0
+; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm1
+; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vucomisd %xmm2, %xmm0
 ; AVX-NEXT:    jb .LBB1_2
 ; AVX-NEXT:  # %bb.1: # %.split
-; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovapd %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ; AVX-NEXT:  .LBB1_2: # %call.sqrt
 ; AVX-NEXT:    jmp sqrt # TAILCALL
Index: llvm/test/CodeGen/X86/sse-intel-ocl.ll
===================================================================
--- llvm/test/CodeGen/X86/sse-intel-ocl.ll
+++ llvm/test/CodeGen/X86/sse-intel-ocl.ll
@@ -16,14 +16,14 @@
 ; WIN32-NEXT:    subl $80, %esp
 ; WIN32-NEXT:    movups 72(%ebp), %xmm4
 ; WIN32-NEXT:    movups 8(%ebp), %xmm3
-; WIN32-NEXT:    addps %xmm4, %xmm3
-; WIN32-NEXT:    movups 56(%ebp), %xmm4
-; WIN32-NEXT:    movups 40(%ebp), %xmm5
-; WIN32-NEXT:    movups 24(%ebp), %xmm6
+; WIN32-NEXT:    movups 56(%ebp), %xmm5
+; WIN32-NEXT:    movups 40(%ebp), %xmm6
+; WIN32-NEXT:    movups 24(%ebp), %xmm7
 ; WIN32-NEXT:    movl %esp, %eax
-; WIN32-NEXT:    addps %xmm6, %xmm0
-; WIN32-NEXT:    addps %xmm5, %xmm1
-; WIN32-NEXT:    addps %xmm4, %xmm2
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    addps %xmm6, %xmm1
+; WIN32-NEXT:    addps %xmm5, %xmm2
+; WIN32-NEXT:    addps %xmm4, %xmm3
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    calll _func_float16_ptr
 ; WIN32-NEXT:    addl $4, %esp
Index: llvm/test/CodeGen/X86/swifterror.ll
===================================================================
--- llvm/test/CodeGen/X86/swifterror.ll
+++ llvm/test/CodeGen/X86/swifterror.ll
@@ -243,8 +243,6 @@
 ; CHECK-i386-NEXT:    .cfi_offset %edi, -8
 ; CHECK-i386-NEXT:    movl 32(%esp), %esi
 ; CHECK-i386-NEXT:    leal 16(%esp), %edi
-; CHECK-i386-NEXT:    fld1
-; CHECK-i386-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-i386-NEXT:  LBB2_1: ## %bb_loop
 ; CHECK-i386-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-i386-NEXT:    movl $0, 16(%esp)
@@ -255,7 +253,7 @@
 ; CHECK-i386-NEXT:    jne LBB2_4
 ; CHECK-i386-NEXT:  ## %bb.2: ## %cont
 ; CHECK-i386-NEXT:    ## in Loop: Header=BB2_1 Depth=1
-; CHECK-i386-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload
+; CHECK-i386-NEXT:    fld1
 ; CHECK-i386-NEXT:    fxch %st(1)
 ; CHECK-i386-NEXT:    fucompp
 ; CHECK-i386-NEXT:    fnstsw %ax
@@ -270,7 +268,7 @@
 ; CHECK-i386-NEXT:    fstp %st(0)
 ; CHECK-i386-NEXT:    movl %ecx, (%esp)
 ; CHECK-i386-NEXT:    calll _free
-; CHECK-i386-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload
+; CHECK-i386-NEXT:    fld1
 ; CHECK-i386-NEXT:    addl $20, %esp
 ; CHECK-i386-NEXT:    popl %esi
 ; CHECK-i386-NEXT:    popl %edi
@@ -470,8 +468,6 @@
 ; CHECK-i386-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-i386-NEXT:    movl 36(%esp), %esi
 ; CHECK-i386-NEXT:    movl 32(%esp), %edi
-; CHECK-i386-NEXT:    fld1
-; CHECK-i386-NEXT:    fstps {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-i386-NEXT:  LBB4_1: ## %bb_loop
 ; CHECK-i386-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; CHECK-i386-NEXT:    testl %esi, %esi
@@ -485,9 +481,8 @@
 ; CHECK-i386-NEXT:    movb $1, 8(%eax)
 ; CHECK-i386-NEXT:  LBB4_3: ## %bb_cont
 ; CHECK-i386-NEXT:    ## in Loop: Header=BB4_1 Depth=1
+; CHECK-i386-NEXT:    fld1
 ; CHECK-i386-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload
-; CHECK-i386-NEXT:    flds {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Reload
-; CHECK-i386-NEXT:    fxch %st(1)
 ; CHECK-i386-NEXT:    fucompp
 ; CHECK-i386-NEXT:    fnstsw %ax
 ; CHECK-i386-NEXT:    ## kill: def $ah killed $ah killed $ax
Index: llvm/tools/llvm-reduce/ReducerWorkItem.cpp
===================================================================
--- llvm/tools/llvm-reduce/ReducerWorkItem.cpp
+++ llvm/tools/llvm-reduce/ReducerWorkItem.cpp
@@ -328,7 +328,7 @@
             std::memcpy(DstMask, SrcMO.getRegMask(),
                         sizeof(*DstMask) *
                             MachineOperand::getRegMaskSize(TRI->getNumRegs()));
-            DstMO.setRegMask(DstMask);
+            DstMO.setRegMask(DstMask, DstMI);
           }
         }
 
Index: llvm/unittests/CodeGen/MachineOperandTest.cpp
===================================================================
--- llvm/unittests/CodeGen/MachineOperandTest.cpp
+++ llvm/unittests/CodeGen/MachineOperandTest.cpp
@@ -61,7 +61,7 @@
   auto MF = createMachineFunction(Ctx, Mod);
 
   uint32_t *Dummy = MF->allocateRegMask();
-  MachineOperand MO = MachineOperand::CreateRegMask(Dummy);
+  MachineOperand MO = MachineOperand::CreateRegMask(Dummy, MF.get());
 
   // Checking some preconditions on the newly created
   // MachineOperand.