diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -585,10 +585,25 @@
       case AMDGPU::SOFT_WQM:
       case AMDGPU::STRICT_WWM: {
         Register DstReg = MI.getOperand(0).getReg();
-
         const TargetRegisterClass *SrcRC, *DstRC;
         std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
 
+        if (MI.isCopy()) {
+          Register SrcReg = MI.getOperand(1).getReg();
+          if (SrcReg == AMDGPU::SCC) {
+            Register SCCCopy = MRI->createVirtualRegister(TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
+            I = BuildMI(*MI.getParent(),
+                        std::next(MachineBasicBlock::iterator(MI)),
+                        MI.getDebugLoc(), TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 : AMDGPU::S_CSELECT_B64), SCCCopy)
+                    .addImm(-1)
+                    .addImm(0);
+            BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), TII->get(AMDGPU::COPY), DstReg)
+                    .addReg(SCCCopy);
+            MI.eraseFromParent();
+            continue;
+          }
+        }
+
         if (!DstReg.isVirtual()) {
           // If the destination register is a physical register there isn't
           // really much we can do to fix this.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -122,7 +122,8 @@
 
   void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                     MachineInstr &SCCDefInst,
-                                    SetVectorType &Worklist) const;
+                                    SetVectorType &Worklist,
+                                    Register NewCond = Register()) const;
   void addSCCDefsToVALUWorklist(MachineOperand &Op,
                                 SetVectorType &Worklist) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4459,20 +4459,20 @@
   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
-  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
-  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
-  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
-  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
-  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
-  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
-  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
-  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
-  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
-  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
-  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
-  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
-  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
-  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
+  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
+  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
+  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
+  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
+  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
+  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
+  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
+  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
+  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
+  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
+  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
+  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
+  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
+  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
@@ -4950,13 +4950,13 @@
       continue;
     }
 
-    if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
+    if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
         !isOperandLegal(MI, Idx, &MO)) {
       legalizeOpWithMove(MI, Idx);
       continue;
     }
 
-    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+    if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
       continue; // VGPRs are legal
 
     // We can use one SGPR in each VOP3 instruction prior to GFX10
@@ -5756,6 +5756,7 @@
     unsigned Opcode = Inst.getOpcode();
     unsigned NewOpcode = getVALUOp(Inst);
 
+    Register CondReg = RI.getVCC();
     // Handle some special cases
     switch (Opcode) {
     default:
@@ -5884,18 +5885,18 @@
       continue;
 
     case AMDGPU::S_CBRANCH_SCC0:
-    case AMDGPU::S_CBRANCH_SCC1:
-      // Clear unused bits of vcc
-      if (ST.isWave32())
-        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
-                AMDGPU::VCC_LO)
-            .addReg(AMDGPU::EXEC_LO)
-            .addReg(AMDGPU::VCC_LO);
-      else
-        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
-                AMDGPU::VCC)
-            .addReg(AMDGPU::EXEC)
-            .addReg(AMDGPU::VCC);
+    case AMDGPU::S_CBRANCH_SCC1: {
+        // Clear unused bits of vcc
+        Register CondReg = Inst.getOperand(1).getReg();
+        bool IsSCC = CondReg == AMDGPU::SCC;
+        Register VCC = RI.getVCC();
+        Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+        unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
+            .addReg(EXEC)
+            .addReg(IsSCC ? VCC : CondReg);
+        Inst.RemoveOperand(1);
+      }
       break;
 
     case AMDGPU::S_BFE_U64:
@@ -6007,8 +6008,36 @@
       lowerSelect(Worklist, Inst, MDT);
       Inst.eraseFromParent();
       continue;
+    case AMDGPU::S_CMP_EQ_I32:
+    case AMDGPU::S_CMP_LG_I32:
+    case AMDGPU::S_CMP_GT_I32:
+    case AMDGPU::S_CMP_GE_I32:
+    case AMDGPU::S_CMP_LT_I32:
+    case AMDGPU::S_CMP_LE_I32:
+    case AMDGPU::S_CMP_EQ_U32:
+    case AMDGPU::S_CMP_LG_U32:
+    case AMDGPU::S_CMP_GT_U32:
+    case AMDGPU::S_CMP_GE_U32:
+    case AMDGPU::S_CMP_LT_U32:
+    case AMDGPU::S_CMP_LE_U32:
+    case AMDGPU::S_CMP_EQ_U64:
+    case AMDGPU::S_CMP_LG_U64: {
+        const MCInstrDesc &NewDesc = get(NewOpcode);
+        CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
+        MachineInstr *NewInstr =
+            BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
+                .add(Inst.getOperand(0))
+                .add(Inst.getOperand(1));
+        legalizeOperands(*NewInstr, MDT);
+        int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
+        MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+        addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
+        Inst.eraseFromParent();
+      }
+      continue;
     }
 
+
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
       // legalize its operands instead.
@@ -6168,47 +6197,50 @@
   MachineOperand &Cond = Inst.getOperand(3);
 
   Register SCCSource = Cond.getReg();
-  // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
-  if (!Cond.isUndef()) {
-    for (MachineInstr &CandI :
-         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
-                    Inst.getParent()->rend())) {
-      if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-          -1) {
-        if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
-          SCCSource = CandI.getOperand(1).getReg();
-        }
-        break;
-      }
-    }
-  }
+  bool IsSCC = (SCCSource == AMDGPU::SCC);
 
   // If this is a trivial select where the condition is effectively not SCC
   // (SCCSource is a source of copy to SCC), then the select is semantically
   // equivalent to copying SCCSource. Hence, there is no need to create
   // V_CNDMASK, we can just use that and bail out.
-  if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
+  if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) &&
       Src1.isImm() && (Src1.getImm() == 0)) {
     MRI.replaceRegWith(Dest.getReg(), SCCSource);
     return;
   }
 
-  const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
-                                      ? &AMDGPU::SReg_64_XEXECRegClass
-                                      : &AMDGPU::SReg_32_XM0_XEXECRegClass;
+  const TargetRegisterClass *TC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
   Register CopySCC = MRI.createVirtualRegister(TC);
 
-  if (SCCSource == AMDGPU::SCC) {
-    // Insert a trivial select instead of creating a copy, because a copy from
-    // SCC would semantically mean just copying a single bit, but we may need
-    // the result to be a vector condition mask that needs preserving.
-    unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
+  if (IsSCC) {
+    // Now look for the closest SCC def if it is a copy
+    // replacing the SCCSource with the COPY source register
+    bool CopyFound = false;
+    for (MachineInstr &CandI :
+         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
+                    Inst.getParent()->rend())) {
+     if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
+         -1) {
+       if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
+         BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+           .addReg(CandI.getOperand(1).getReg());
+         CopyFound = true;
+       }
+       break;
+     }
+    }
+    if (!CopyFound) {
+      // SCC def is not a copy
+      // Insert a trivial select instead of creating a copy, because a copy from
+      // SCC would semantically mean just copying a single bit, but we may need
+      // the result to be a vector condition mask that needs preserving.
+      unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
                                                     : AMDGPU::S_CSELECT_B32;
-    auto NewSelect =
-        BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
-    NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
-  } else {
-    BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
+      auto NewSelect =
+          BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+      NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
+    }
   }
 
   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -6219,7 +6251,7 @@
           .add(Src1) // False
           .addImm(0)
           .add(Src0) // True
-          .addReg(CopySCC);
+          .addReg(IsSCC ? CopySCC : SCCSource);
 
   MRI.replaceRegWith(Dest.getReg(), ResultReg);
   legalizeOperands(*UpdatedInst, MDT);
@@ -6810,8 +6842,8 @@
 
 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                                MachineInstr &SCCDefInst,
-                                               SetVectorType &Worklist) const {
-  bool SCCUsedImplicitly = false;
+                                               SetVectorType &Worklist,
+                                               Register NewCond) const {
 
   // Ensure that def inst defines SCC, which is still live.
   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
@@ -6823,33 +6855,18 @@
        make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                   SCCDefInst.getParent()->end())) {
     // Check if SCC is used first.
-    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
+    int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI);
+    if (SCCIdx != -1) {
       if (MI.isCopy()) {
         MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
         Register DestReg = MI.getOperand(0).getReg();
 
-        for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
-          if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
-              (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
-            User.getOperand(4).setReg(RI.getVCC());
-            Worklist.insert(&User);
-          } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
-            User.getOperand(5).setReg(RI.getVCC());
-            // No need to add to Worklist.
-          }
-        }
+        MRI.replaceRegWith(DestReg, NewCond);
         CopyToDelete.push_back(&MI);
       } else {
-        if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
-            MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
-          // This is an implicit use of SCC and it is really expected by
-          // the SCC users to handle.
-          // We cannot preserve the edge to the user so add the explicit
-          // copy: SCC = COPY VCC.
-          // The copy will be cleaned up during the processing of the user
-          // in lowerSelect.
-          SCCUsedImplicitly = true;
-        }
+
+        if (NewCond.isValid())
+          MI.getOperand(SCCIdx).setReg(NewCond);
 
         Worklist.insert(&MI);
       }
@@ -6860,12 +6877,6 @@
   }
   for (auto &Copy : CopyToDelete)
     Copy->eraseFromParent();
-
-  if (SCCUsedImplicitly) {
-    BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
-            SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
-        .addReg(RI.getVCC());
-  }
 }
 
 // Instructions that use SCC may be converted to VALU instructions. When that
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -527,15 +527,7 @@
 def si_setcc_uniform : PatFrag <
   (ops node:$lhs, node:$rhs, node:$cond),
   (setcc node:$lhs, node:$rhs, node:$cond), [{
-  for (SDNode *Use : N->uses()) {
-    if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
-      return false;
-
-    unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
-    if (Reg != AMDGPU::SCC)
-      return false;
-  }
-  return true;
+  return !N->isDivergent();
 }]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -10,7 +10,8 @@
 ; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
 ; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
-; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b64 vcc, -1, 0
 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -22,7 +23,8 @@
 ; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
 
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
-; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b64 vcc, -1, 0
 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
 ; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -76,7 +78,8 @@
 ; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 
 ; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
+; CI-DAG: s_cselect_b64 vcc, -1, 0
 ; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
 ; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
@@ -89,7 +92,8 @@
 ; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
-; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; GFX9: s_cmp_lg_u32 [[PTR]], -1
+; GFX9: s_cselect_b64 vcc, -1, 0
 ; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
 ; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -18,7 +18,8 @@
 
 ; GCN-ALLOCA: v_add_{{[iu]}}32_e32 [[RESULT:v[0-9]+]], vcc, v{{[0-9]+}}, v0
 
-; GCN-PROMOTE: v_cmp_eq_u32_e64 vcc, [[IN]], 1
+; GCN-PROMOTE: s_cmp_eq_u32 [[IN]], 1
+; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0
 ; GCN-PROMOTE-NEXT: v_addc_u32_e32 [[RESULT:v[0-9]+]], vcc, 0, v0, vcc
 
 ; GCN: buffer_store_dword [[RESULT]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -236,38 +236,39 @@
 ; GFX6-LABEL: sdiv_i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
 ; GFX6-NEXT:    s_add_i32 s3, s3, s8
-; GFX6-NEXT:    s_xor_b32 s9, s3, s8
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX6-NEXT:    s_sub_i32 s3, 0, s9
-; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
-; GFX6-NEXT:    s_add_i32 s1, s2, s0
+; GFX6-NEXT:    s_xor_b32 s3, s3, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    s_xor_b32 s2, s0, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
+; GFX6-NEXT:    s_add_i32 s1, s2, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    s_xor_b32 s2, s0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -279,32 +280,32 @@
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s5, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX9-NEXT:    s_sub_i32 s3, 0, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s4, s5, s4
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
@@ -359,13 +360,13 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_xor_b32 s6, s3, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_sub_i32 s3, 0, s6
-; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s4
+; GFX6-NEXT:    s_xor_b32 s4, s3, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX6-NEXT:    s_sub_i32 s3, 0, s4
+; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s5, s2, s4
+; GFX6-NEXT:    s_xor_b32 s6, s2, s5
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -373,17 +374,17 @@
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -1289,62 +1290,62 @@
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s11
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s9
-; GFX9-NEXT:    s_sub_i32 s2, 0, s11
-; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s11
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v5
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
+; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, s9
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
+; GFX9-NEXT:    v_sub_u32_e32 v6, s5, v6
+; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v2
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v7
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    s_sub_i32 s2, 0, s11
+; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v6
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s10
+; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v3, s10
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v6, s6, v8
+; GFX9-NEXT:    v_add_u32_e32 v7, 1, v3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s11
-; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v7, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s10, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, s11
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
-; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v5
+; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v6
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
@@ -1817,14 +1818,14 @@
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s2, s12, 31
 ; GFX6-NEXT:    s_add_i32 s3, s12, s2
-; GFX6-NEXT:    s_xor_b32 s12, s3, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX6-NEXT:    s_ashr_i32 s3, s13, 31
-; GFX6-NEXT:    s_add_i32 s0, s13, s3
-; GFX6-NEXT:    s_xor_b32 s13, s0, s3
+; GFX6-NEXT:    s_xor_b32 s3, s3, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_ashr_i32 s12, s13, 31
+; GFX6-NEXT:    s_add_i32 s0, s13, s12
+; GFX6-NEXT:    s_xor_b32 s13, s0, s12
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX6-NEXT:    s_sub_i32 s1, 0, s12
+; GFX6-NEXT:    s_sub_i32 s1, 0, s3
 ; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s16, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -1840,28 +1841,28 @@
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s12
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    s_add_i32 s1, s9, s0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT:    s_xor_b32 s2, s0, s3
+; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    s_ashr_i32 s3, s14, 31
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_xor_b32 s2, s0, s12
 ; GFX6-NEXT:    s_add_i32 s0, s14, s3
-; GFX6-NEXT:    s_xor_b32 s9, s0, s3
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX6-NEXT:    s_xor_b32 s8, s0, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s13
@@ -1873,7 +1874,7 @@
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s13, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s9
+; GFX6-NEXT:    s_sub_i32 s0, 0, s8
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
@@ -1883,25 +1884,25 @@
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
 ; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
 ; GFX6-NEXT:    s_ashr_i32 s0, s10, 31
-; GFX6-NEXT:    s_add_i32 s8, s15, s2
+; GFX6-NEXT:    s_add_i32 s9, s15, s2
 ; GFX6-NEXT:    s_add_i32 s1, s10, s0
-; GFX6-NEXT:    s_xor_b32 s8, s8, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s8
+; GFX6-NEXT:    s_xor_b32 s9, s9, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX6-NEXT:    s_xor_b32 s3, s0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s9
+; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX6-NEXT:    s_sub_i32 s0, 0, s8
+; GFX6-NEXT:    s_sub_i32 s0, 0, s9
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
 ; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX6-NEXT:    s_add_i32 s1, s11, s0
@@ -1911,19 +1912,19 @@
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s8
+; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s9
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
@@ -1933,119 +1934,119 @@
 ; GFX9-LABEL: sdiv_v4i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s13, 0x4f7ffffe
+; GFX9-NEXT:    s_mov_b32 s15, 0x4f7ffffe
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
 ; GFX9-NEXT:    s_add_i32 s3, s8, s2
-; GFX9-NEXT:    s_xor_b32 s14, s3, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX9-NEXT:    s_ashr_i32 s8, s9, 31
-; GFX9-NEXT:    s_add_i32 s9, s9, s8
-; GFX9-NEXT:    s_xor_b32 s15, s9, s8
+; GFX9-NEXT:    s_xor_b32 s3, s3, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX9-NEXT:    s_add_i32 s9, s9, s12
+; GFX9-NEXT:    s_xor_b32 s9, s9, s12
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
-; GFX9-NEXT:    s_sub_i32 s12, 0, s14
-; GFX9-NEXT:    s_ashr_i32 s3, s4, 31
-; GFX9-NEXT:    v_mul_f32_e32 v0, s13, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX9-NEXT:    s_sub_i32 s14, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
+; GFX9-NEXT:    v_mul_f32_e32 v0, s15, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s3
-; GFX9-NEXT:    s_xor_b32 s4, s4, s3
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s13, v1
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_xor_b32 s4, s4, s8
+; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, s15, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s12, 0, s15
+; GFX9-NEXT:    s_sub_i32 s14, 0, s9
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    s_xor_b32 s2, s3, s2
+; GFX9-NEXT:    s_ashr_i32 s13, s5, 31
+; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
+; GFX9-NEXT:    s_add_i32 s5, s5, s13
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
-; GFX9-NEXT:    s_add_i32 s3, s5, s9
-; GFX9-NEXT:    s_xor_b32 s3, s3, s9
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s14
+; GFX9-NEXT:    s_xor_b32 s5, s5, s13
+; GFX9-NEXT:    s_xor_b32 s2, s8, s2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
 ; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s14, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s15
-; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
 ; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
 ; GFX9-NEXT:    s_add_i32 s4, s10, s3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; GFX9-NEXT:    s_xor_b32 s2, s9, s8
-; GFX9-NEXT:    s_xor_b32 s9, s4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
+; GFX9-NEXT:    s_xor_b32 s4, s4, s3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s9
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    s_sub_i32 s4, 0, s9
-; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v3
+; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s9, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT:    s_sub_i32 s5, 0, s4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, s5, v3
+; GFX9-NEXT:    s_add_i32 s9, s11, s8
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v3
-; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
-; GFX9-NEXT:    s_add_i32 s5, s6, s4
-; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
-; GFX9-NEXT:    s_add_i32 s8, s11, s6
-; GFX9-NEXT:    s_xor_b32 s8, s8, s6
+; GFX9-NEXT:    s_xor_b32 s9, s9, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
-; GFX9-NEXT:    s_xor_b32 s5, s5, s4
-; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s9
+; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
-; GFX9-NEXT:    s_xor_b32 s2, s4, s3
-; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v3
+; GFX9-NEXT:    s_xor_b32 s6, s6, s5
+; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
+; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mul_f32_e32 v3, s15, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s9
-; GFX9-NEXT:    s_sub_i32 s3, 0, s8
-; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s4
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
+; GFX9-NEXT:    s_xor_b32 s2, s13, s12
+; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
+; GFX9-NEXT:    s_xor_b32 s2, s5, s3
+; GFX9-NEXT:    s_sub_i32 s3, 0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
-; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    v_sub_u32_e32 v5, s6, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s4, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
 ; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
-; GFX9-NEXT:    s_add_i32 s4, s7, s3
-; GFX9-NEXT:    s_xor_b32 s4, s4, s3
+; GFX9-NEXT:    s_add_i32 s5, s7, s3
+; GFX9-NEXT:    s_xor_b32 s5, s5, s3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v5
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s8
+; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s9
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
-; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v5
+; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
-; GFX9-NEXT:    s_xor_b32 s2, s3, s6
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
+; GFX9-NEXT:    s_xor_b32 s2, s3, s8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
@@ -2216,111 +2217,111 @@
 ; GFX6-LABEL: srem_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
-; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
+; GFX6-NEXT:    s_mov_b32 s14, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
 ; GFX6-NEXT:    s_add_i32 s8, s8, s2
-; GFX6-NEXT:    s_xor_b32 s12, s8, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
-; GFX6-NEXT:    s_add_i32 s9, s9, s8
-; GFX6-NEXT:    s_xor_b32 s14, s9, s8
+; GFX6-NEXT:    s_xor_b32 s8, s8, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX6-NEXT:    s_add_i32 s9, s9, s12
+; GFX6-NEXT:    s_xor_b32 s9, s9, s12
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s14
-; GFX6-NEXT:    s_sub_i32 s9, 0, s12
-; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
-; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
+; GFX6-NEXT:    s_sub_i32 s13, 0, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX6-NEXT:    s_ashr_i32 s12, s4, 31
+; GFX6-NEXT:    v_mul_f32_e32 v0, s14, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    s_add_i32 s4, s4, s8
-; GFX6-NEXT:    s_xor_b32 s4, s4, s8
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
+; GFX6-NEXT:    s_add_i32 s4, s4, s12
+; GFX6-NEXT:    s_xor_b32 s4, s4, s12
+; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX6-NEXT:    v_mul_f32_e32 v1, s14, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    s_sub_i32 s9, 0, s14
+; GFX6-NEXT:    s_sub_i32 s13, 0, s9
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
-; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX6-NEXT:    s_add_i32 s5, s5, s9
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
+; GFX6-NEXT:    v_mul_lo_u32 v2, s13, v1
+; GFX6-NEXT:    s_ashr_i32 s13, s5, 31
+; GFX6-NEXT:    s_add_i32 s5, s5, s13
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    s_xor_b32 s4, s5, s9
-; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
-; GFX6-NEXT:    s_add_i32 s10, s10, s5
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX6-NEXT:    s_xor_b32 s10, s10, s5
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    s_xor_b32 s4, s5, s13
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
+; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GFX6-NEXT:    s_add_i32 s8, s10, s5
+; GFX6-NEXT:    s_xor_b32 s5, s8, s5
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s5
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
-; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s14
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v2
+; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
+; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
+; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX6-NEXT:    v_mul_f32_e32 v2, s14, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
-; GFX6-NEXT:    s_sub_i32 s4, 0, s10
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
+; GFX6-NEXT:    s_sub_i32 s4, 0, s5
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
+; GFX6-NEXT:    s_add_i32 s9, s11, s8
 ; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
-; GFX6-NEXT:    s_add_i32 s5, s6, s4
-; GFX6-NEXT:    s_ashr_i32 s6, s11, 31
-; GFX6-NEXT:    s_add_i32 s8, s11, s6
-; GFX6-NEXT:    s_xor_b32 s8, s8, s6
+; GFX6-NEXT:    s_xor_b32 s8, s9, s8
+; GFX6-NEXT:    s_add_i32 s6, s6, s4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX6-NEXT:    s_xor_b32 s5, s5, s4
-; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
+; GFX6-NEXT:    s_xor_b32 s6, s6, s4
+; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
+; GFX6-NEXT:    v_xor_b32_e32 v1, s13, v1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s13, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s5
+; GFX6-NEXT:    v_mul_f32_e32 v3, s14, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX6-NEXT:    s_sub_i32 s5, 0, s8
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX6-NEXT:    s_sub_i32 s6, 0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v3
-; GFX6-NEXT:    s_ashr_i32 s5, s7, 31
-; GFX6-NEXT:    s_add_i32 s6, s7, s5
-; GFX6-NEXT:    s_xor_b32 s6, s6, s5
+; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v3
+; GFX6-NEXT:    s_ashr_i32 s6, s7, 31
+; GFX6-NEXT:    s_add_i32 s7, s7, s6
+; GFX6-NEXT:    s_xor_b32 s7, s7, s6
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v3
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v3, s5, v3
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
+; GFX6-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -2374,8 +2375,8 @@
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s2
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
@@ -2385,26 +2386,27 @@
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX9-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
-; GFX9-NEXT:    s_add_i32 s5, s11, s4
-; GFX9-NEXT:    s_xor_b32 s4, s5, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
+; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
+; GFX9-NEXT:    s_add_i32 s4, s11, s3
+; GFX9-NEXT:    s_xor_b32 s3, s4, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    s_ashr_i32 s3, s6, 31
-; GFX9-NEXT:    s_add_i32 s5, s6, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
+; GFX9-NEXT:    s_add_i32 s5, s6, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s3
+; GFX9-NEXT:    s_xor_b32 s5, s5, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s4
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
 ; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
@@ -2416,20 +2418,19 @@
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s3
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, s3, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -5737,52 +5738,52 @@
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_movk_i32 s4, 0x1000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s7, s4, s2
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX9-NEXT:    s_lshl_b32 s6, s4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
+; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
 ; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
+; GFX9-NEXT:    s_sub_i32 s3, 0, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s3, 0, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX9-NEXT:    s_sub_i32 s2, 0, s4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s2, 0, s7
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s6
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
 ; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v4
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
@@ -6022,26 +6023,26 @@
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GFX6-NEXT:    s_movk_i32 s4, 0x1000
+; GFX6-NEXT:    s_mov_b32 s5, 0x4f7ffffe
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    s_lshl_b32 s2, s4, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_lshl_b32 s3, s4, s3
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT:    s_mov_b32 s4, 0x4f7ffffe
+; GFX6-NEXT:    s_sub_i32 s4, 0, s2
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s8
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, s4, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, s5, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s4, v1
+; GFX6-NEXT:    v_mul_f32_e32 v1, s5, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v0
+; GFX6-NEXT:    s_sub_i32 s4, 0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v1
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX6-NEXT:    s_sub_i32 s2, 0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
@@ -6049,14 +6050,14 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
@@ -6082,38 +6083,38 @@
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    s_sub_i32 s2, 0, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -6212,40 +6213,41 @@
 ; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_xor_b32 s7, s3, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX6-NEXT:    s_sub_i32 s3, 0, s7
-; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s5
+; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX6-NEXT:    s_add_i32 s3, s3, s8
+; GFX6-NEXT:    s_xor_b32 s3, s3, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX6-NEXT:    s_sub_i32 s4, 0, s3
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s6, s2, s5
-; GFX6-NEXT:    s_xor_b32 s4, s5, s4
-; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
+; GFX6-NEXT:    s_add_i32 s1, s2, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_xor_b32 s1, s1, s0
+; GFX6-NEXT:    s_xor_b32 s2, s0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s7
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
@@ -6257,30 +6259,30 @@
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_xor_b32 s5, s3, s4
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GFX9-NEXT:    s_sub_i32 s3, 0, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
+; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s5
+; GFX9-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    s_xor_b32 s2, s3, s4
+; GFX9-NEXT:    s_xor_b32 s2, s5, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
@@ -6493,7 +6495,7 @@
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GFX6-NEXT:    s_movk_i32 s10, 0x1000
-; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
+; GFX6-NEXT:    s_mov_b32 s12, 0x4f7ffffe
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
@@ -6501,35 +6503,35 @@
 ; GFX6-NEXT:    s_lshl_b32 s2, s10, s2
 ; GFX6-NEXT:    s_ashr_i32 s11, s2, 31
 ; GFX6-NEXT:    s_add_i32 s2, s2, s11
-; GFX6-NEXT:    s_xor_b32 s12, s2, s11
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX6-NEXT:    s_xor_b32 s2, s2, s11
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX6-NEXT:    s_lshl_b32 s0, s10, s3
-; GFX6-NEXT:    s_sub_i32 s3, 0, s12
-; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
+; GFX6-NEXT:    s_sub_i32 s10, 0, s2
+; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_add_i32 s0, s0, s2
-; GFX6-NEXT:    s_xor_b32 s10, s0, s2
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
-; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    s_add_i32 s0, s0, s3
 ; GFX6-NEXT:    s_ashr_i32 s1, s8, 31
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_f32_e32 v0, s12, v0
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s10, v0
+; GFX6-NEXT:    s_xor_b32 s10, s0, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
 ; GFX6-NEXT:    s_add_i32 s0, s8, s1
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    s_xor_b32 s0, s0, s1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX6-NEXT:    s_xor_b32 s3, s1, s11
-; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_xor_b32 s8, s1, s11
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v2
+; GFX6-NEXT:    v_mul_f32_e32 v1, s12, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s12
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s10
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
@@ -6538,15 +6540,15 @@
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
-; GFX6-NEXT:    s_xor_b32 s2, s0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GFX6-NEXT:    s_xor_b32 s2, s0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
@@ -6566,67 +6568,67 @@
 ; GFX9-NEXT:    s_movk_i32 s8, 0x1000
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b32 s11, 0x4f7ffffe
+; GFX9-NEXT:    s_mov_b32 s10, 0x4f7ffffe
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s2, s8, s2
 ; GFX9-NEXT:    s_ashr_i32 s9, s2, 31
 ; GFX9-NEXT:    s_add_i32 s2, s2, s9
-; GFX9-NEXT:    s_xor_b32 s10, s2, s9
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX9-NEXT:    s_xor_b32 s2, s2, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GFX9-NEXT:    s_lshl_b32 s0, s8, s3
 ; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s8, s0, s1
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_sub_i32 s0, 0, s10
-; GFX9-NEXT:    v_mul_f32_e32 v0, s11, v0
+; GFX9-NEXT:    s_xor_b32 s0, s0, s1
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX9-NEXT:    s_sub_i32 s3, 0, s2
+; GFX9-NEXT:    v_mul_f32_e32 v0, s10, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s3, 0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s11, v1
-; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
+; GFX9-NEXT:    s_sub_i32 s8, 0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, s10, v1
+; GFX9-NEXT:    s_ashr_i32 s3, s6, 31
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    s_add_i32 s2, s6, s0
-; GFX9-NEXT:    s_xor_b32 s2, s2, s0
-; GFX9-NEXT:    s_xor_b32 s0, s0, s9
+; GFX9-NEXT:    s_add_i32 s6, s6, s3
+; GFX9-NEXT:    s_xor_b32 s6, s6, s3
+; GFX9-NEXT:    s_xor_b32 s3, s3, s9
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s10
+; GFX9-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
+; GFX9-NEXT:    s_xor_b32 s1, s8, s1
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
-; GFX9-NEXT:    v_sub_u32_e32 v4, s2, v4
-; GFX9-NEXT:    s_add_i32 s2, s7, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    v_sub_u32_e32 v4, s6, v4
+; GFX9-NEXT:    s_add_i32 s6, s7, s8
+; GFX9-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
-; GFX9-NEXT:    v_mul_hi_u32 v1, s2, v1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
+; GFX9-NEXT:    v_mul_hi_u32 v1, s6, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v4
+; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s0
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
-; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GFX9-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
+; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v4, s0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX9-NEXT:    s_xor_b32 s0, s3, s1
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v1, s1, v1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -6737,13 +6739,13 @@
 ; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
 ; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_xor_b32 s6, s3, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX6-NEXT:    s_sub_i32 s3, 0, s6
-; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s4
+; GFX6-NEXT:    s_xor_b32 s4, s3, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX6-NEXT:    s_sub_i32 s3, 0, s4
+; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX6-NEXT:    s_add_i32 s2, s2, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_xor_b32 s5, s2, s4
+; GFX6-NEXT:    s_xor_b32 s6, s2, s5
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -6751,17 +6753,17 @@
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -6958,56 +6960,56 @@
 ; GFX6-NEXT:    s_lshl_b32 s2, s6, s2
 ; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
 ; GFX6-NEXT:    s_add_i32 s2, s2, s4
-; GFX6-NEXT:    s_xor_b32 s9, s2, s4
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GFX6-NEXT:    s_lshl_b32 s2, s6, s3
-; GFX6-NEXT:    s_ashr_i32 s6, s2, 31
-; GFX6-NEXT:    s_add_i32 s2, s2, s6
+; GFX6-NEXT:    s_xor_b32 s2, s2, s4
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_lshl_b32 s3, s6, s3
+; GFX6-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX6-NEXT:    s_add_i32 s3, s3, s6
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX6-NEXT:    s_sub_i32 s8, 0, s9
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GFX6-NEXT:    s_xor_b32 s3, s3, s6
+; GFX6-NEXT:    s_sub_i32 s9, 0, s2
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s3
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s10, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
-; GFX6-NEXT:    s_add_i32 s0, s0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v0
-; GFX6-NEXT:    s_xor_b32 s8, s2, s6
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
-; GFX6-NEXT:    s_xor_b32 s0, s0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    s_sub_i32 s2, 0, s8
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v0
+; GFX6-NEXT:    s_sub_i32 s9, 0, s3
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_ashr_i32 s8, s0, 31
+; GFX6-NEXT:    s_add_i32 s0, s0, s8
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX6-NEXT:    s_xor_b32 s0, s0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s10, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s9
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
-; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
 ; GFX6-NEXT:    s_add_i32 s1, s1, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
-; GFX6-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
@@ -7021,41 +7023,41 @@
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_movk_i32 s8, 0x1000
 ; GFX9-NEXT:    s_mov_b32 s9, 0x4f7ffffe
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, s8, s6
 ; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
 ; GFX9-NEXT:    s_add_i32 s0, s0, s1
 ; GFX9-NEXT:    s_xor_b32 s0, s0, s1
-; GFX9-NEXT:    s_lshl_b32 s1, s8, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX9-NEXT:    s_lshl_b32 s1, s8, s7
 ; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
 ; GFX9-NEXT:    s_add_i32 s1, s1, s6
 ; GFX9-NEXT:    s_xor_b32 s1, s1, s6
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
 ; GFX9-NEXT:    s_sub_i32 s7, 0, s0
 ; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s9, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_add_i32 s4, s4, s6
+; GFX9-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s9, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s7, v0
 ; GFX9-NEXT:    s_sub_i32 s7, 0, s1
-; GFX9-NEXT:    s_xor_b32 s4, s4, s6
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v1
 ; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
 ; GFX9-NEXT:    s_add_i32 s5, s5, s7
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    s_xor_b32 s5, s5, s7
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
@@ -7120,7 +7122,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    s_movk_i32 s4, 0x11e
+; GFX6-NEXT:    s_mov_b32 s4, 0x976a7376
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -7136,7 +7138,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GFX6-NEXT:    s_mov_b32 s2, 0x976a7377
+; GFX6-NEXT:    s_movk_i32 s2, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -7144,7 +7146,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX6-NEXT:    s_movk_i32 s3, 0x11f
+; GFX6-NEXT:    s_mov_b32 s3, 0x976a7377
 ; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
@@ -7174,24 +7176,24 @@
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
-; GFX6-NEXT:    v_mov_b32_e32 v5, s3
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s3, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v4
-; GFX6-NEXT:    s_mov_b32 s2, 0x976a7376
+; GFX6-NEXT:    s_movk_i32 s3, 0x11e
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v5
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -7201,11 +7203,11 @@
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
@@ -7222,60 +7224,58 @@
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x68958c89
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0x11f
-; GFX9-NEXT:    s_mov_b32 s9, 0x976a7376
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s5
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v9
+; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -7283,56 +7283,58 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
+; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_mov_b32 s2, 0x976a7377
+; GFX9-NEXT:    s_movk_i32 s2, 0x11f
+; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
-; GFX9-NEXT:    v_mov_b32_e32 v5, s8
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
-; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[0:1], s6, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s2, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    s_movk_i32 s6, 0x11e
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v3
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    s_movk_i32 s3, 0x11e
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
+; GFX9-NEXT:    s_mov_b32 s6, 0x976a7376
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v7, v2, s[0:1]
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v5, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
   store i64 %r, i64 addrspace(1)* %out
@@ -7477,9 +7479,9 @@
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_movk_i32 s6, 0xf001
+; GFX6-NEXT:    s_movk_i32 s2, 0xf001
+; GFX6-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -7488,96 +7490,96 @@
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GFX6-NEXT:    s_movk_i32 s0, 0xfff
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s6
-; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s6
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s2
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v3
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v8, v1, v4
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s6
-; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v6, v3, s6
-; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s6
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v8
-; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s2
+; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
+; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v6, s11, v1
+; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v6
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v6
+; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v6
+; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s11, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
+; GFX6-NEXT:    s_movk_i32 s0, 0xfff
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s0
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s0
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mov_b32_e32 v3, s11
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
-; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mov_b32_e32 v5, s11
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
+; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
 ; GFX6-NEXT:    s_movk_i32 s0, 0xffe
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
@@ -7598,7 +7600,6 @@
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0xfff
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
@@ -7623,6 +7624,7 @@
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v2, s4
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s4
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
@@ -7636,8 +7638,7 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
+; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
@@ -7646,6 +7647,7 @@
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
@@ -7655,44 +7657,43 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 12
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
-; GFX9-NEXT:    s_movk_i32 s6, 0xffe
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s8, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s0
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s0
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, <2 x i64> addrspace(1)* %out
@@ -7781,8 +7782,9 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX6-NEXT:    s_movk_i32 s12, 0x11f
-; GFX6-NEXT:    s_mov_b32 s13, 0x9761f7c9
+; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s8, s4
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
@@ -7791,13 +7793,12 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s9, s5
+; GFX6-NEXT:    s_movk_i32 s4, 0x11f
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    s_movk_i32 s5, 0x11e
+; GFX6-NEXT:    s_mov_b32 s9, s5
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
@@ -7808,7 +7809,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
 ; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GFX6-NEXT:    s_mov_b32 s8, s4
+; GFX6-NEXT:    s_movk_i32 s5, 0x11e
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
@@ -7816,14 +7817,13 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX6-NEXT:    s_mov_b32 s4, 0x9761f7c8
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
@@ -7847,25 +7847,26 @@
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s12
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s13
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s13
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s13
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GFX6-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s13, v0
+; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
 ; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
+; GFX6-NEXT:    s_mov_b32 s6, 0x9761f7c8
 ; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s4, v4
-; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s13, v4
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v4
+; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, v5
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
@@ -7874,9 +7875,9 @@
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -7893,61 +7894,61 @@
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
 ; GFX9-NEXT:    s_mov_b32 s5, 0x689e0837
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_movk_i32 s8, 0x11f
-; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s5
-; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
+; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s5
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
-; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v9
+; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -7955,54 +7956,54 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
+; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s6, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s8
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[2:3], s9, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, vcc, 0, v2, s[2:3]
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s9, v0
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
 ; GFX9-NEXT:    s_movk_i32 s6, 0x11e
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s9, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v5, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v4
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
   store i64 %r, i64 addrspace(1)* %out
@@ -8293,33 +8294,33 @@
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s3, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
-; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s3, v8
+; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
@@ -8409,34 +8410,34 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s3, v0
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s0, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_hi_u32 v6, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s3
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s0, v9
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s3, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc
+; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -8706,16 +8707,16 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT:    s_add_u32 s0, s6, s12
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
+; GFX9-NEXT:    s_add_u32 s0, s6, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s1, s7, s12
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s1, s7, s2
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
@@ -8739,24 +8740,24 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
-; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[0:1], s6, v4
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, vcc, v5, v6, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s10, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
+; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v4
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v7, v3, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
@@ -8764,9 +8765,9 @@
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -8943,33 +8944,33 @@
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
-; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s9
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s1
-; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
-; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
+; GFX6-NEXT:    v_mul_hi_u32 v5, s9, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s9
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s1
+; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v8
+; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
 ; GFX6-NEXT:    s_movk_i32 s0, 0xffe
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
+; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
+; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
@@ -9062,40 +9063,40 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_movk_i32 s3, 0xfff
+; GFX9-NEXT:    s_movk_i32 s0, 0xfff
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s3, v0
-; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v5
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    s_movk_i32 s3, 0xffe
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v5, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s0
+; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s0
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_mov_b32_e32 v6, s7
+; GFX9-NEXT:    v_sub_co_u32_e32 v9, vcc, s6, v9
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s0, v9
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT:    s_movk_i32 s0, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v9
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
@@ -9398,9 +9399,9 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
-; GFX9-NEXT:    s_mov_b32 s18, 0x4f800000
-; GFX9-NEXT:    s_mov_b32 s19, 0x5f7ffffc
-; GFX9-NEXT:    s_mov_b32 s20, 0x2f800000
+; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
+; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
+; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
@@ -9411,16 +9412,16 @@
 ; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT:    s_mov_b32 s21, 0xcf800000
+; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
 ; GFX9-NEXT:    s_sub_u32 s14, 0, s10
 ; GFX9-NEXT:    s_subb_u32 s4, 0, s11
-; GFX9-NEXT:    v_mac_f32_e32 v0, s18, v1
+; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
-; GFX9-NEXT:    v_mul_f32_e32 v0, s19, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, s20, v0
+; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, s21, v1
+; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
@@ -9473,21 +9474,20 @@
 ; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    s_add_u32 s2, s4, s14
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_addc_u32 s3, s5, s14
 ; GFX9-NEXT:    s_mov_b32 s15, s14
-; GFX9-NEXT:    s_xor_b64 s[16:17], s[2:3], s[14:15]
+; GFX9-NEXT:    s_addc_u32 s3, s5, s14
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s16, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v1
-; GFX9-NEXT:    v_mul_hi_u32 v7, s17, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s17, v1
+; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
+; GFX9-NEXT:    v_mul_hi_u32 v7, s5, v1
+; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s17, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s17, v0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
+; GFX9-NEXT:    v_mul_lo_u32 v4, s5, v0
+; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
@@ -9497,59 +9497,60 @@
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    s_ashr_i32 s14, s9, 31
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, s17, v2
-; GFX9-NEXT:    s_mov_b32 s15, s14
-; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[0:1], s16, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v7, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s10, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v8, s17
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v8, v2, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s8, s14
-; GFX9-NEXT:    s_addc_u32 s1, s9, s14
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[14:15]
+; GFX9-NEXT:    v_sub_u32_e32 v4, s5, v2
+; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s4, v3
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v3
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v7, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], 2, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
+; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v8, s5
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[14:15], s[12:13]
+; GFX9-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX9-NEXT:    s_add_u32 s8, s8, s12
+; GFX9-NEXT:    s_mov_b32 s13, s12
+; GFX9-NEXT:    s_addc_u32 s9, s9, s12
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v11, s9
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
-; GFX9-NEXT:    v_mac_f32_e32 v10, s18, v11
+; GFX9-NEXT:    v_mac_f32_e32 v10, s16, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
 ; GFX9-NEXT:    v_rcp_f32_e32 v3, v10
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[2:3]
-; GFX9-NEXT:    v_mul_f32_e32 v3, s19, v3
-; GFX9-NEXT:    v_mul_f32_e32 v4, s20, v3
+; GFX9-NEXT:    s_sub_u32 s10, 0, s8
+; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v3
+; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX9-NEXT:    v_mac_f32_e32 v3, s21, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    s_sub_u32 s2, 0, s8
-; GFX9-NEXT:    s_subb_u32 s3, 0, s9
-; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s11, 0, s9
+; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v3
+; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v3
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v7
@@ -9561,8 +9562,8 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
-; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    s_mov_b32 s11, s10
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s5, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
@@ -9570,11 +9571,11 @@
 ; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v2
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v3
+; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v2
+; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v2
+; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v2
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
 ; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
@@ -9591,9 +9592,11 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
@@ -9606,8 +9609,7 @@
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX9-NEXT:    v_xor_b32_e32 v0, s12, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s13, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
@@ -9616,32 +9618,31 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v2
-; GFX9-NEXT:    v_mov_b32_e32 v8, s13
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s12, v0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v2
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NEXT:    v_sub_co_u32_e64 v5, s[0:1], s6, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v7, vcc, v7, v8, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s8, v5
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 2, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, 1, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
+; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s8, v5
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 2, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v2
+; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s7
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v9, v4, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v9, v4, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
@@ -9649,9 +9650,9 @@
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[14:15]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
@@ -9659,7 +9660,7 @@
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -9875,28 +9876,28 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s3, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
+; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -10199,25 +10200,25 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s6, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s9
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v3, v4, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[2:3], s8, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, vcc, 0, v3, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v3, v4, s[2:3]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v5
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
+; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v6, v1, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
@@ -10226,7 +10227,7 @@
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
@@ -10696,43 +10697,43 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v0, s14, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s8, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s15
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[2:3], s14, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, vcc, 0, v2, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s14, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s8, v0
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s14, v0
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s15, v7
+; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s14, v4
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s14, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s15, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GFX9-NEXT:    s_ashr_i32 s2, s13, 31
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX9-NEXT:    s_add_u32 s8, s12, s2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s9
-; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v7, v1, s[0:1]
-; GFX9-NEXT:    s_ashr_i32 s0, s13, 31
-; GFX9-NEXT:    s_add_u32 s8, s12, s0
-; GFX9-NEXT:    s_mov_b32 s1, s0
-; GFX9-NEXT:    s_addc_u32 s9, s13, s0
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s9
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s9, s13, s2
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s9
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v7, v1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX9-NEXT:    v_mac_f32_e32 v8, s16, v9
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v1
-; GFX9-NEXT:    v_mac_f32_e32 v9, s16, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
-; GFX9-NEXT:    v_rcp_f32_e32 v8, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v8
 ; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
 ; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
@@ -10814,29 +10815,29 @@
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s6
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s6, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[0:1], s10, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v4, s11, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1]
-; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[2:3], s8, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, vcc, 0, v4, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3]
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v2
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc
+; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s8, v2
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v8
+; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v7
+; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v8, v3, s[0:1]
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v8, v3, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
@@ -10845,7 +10846,7 @@
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v5, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v5, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -315,7 +315,8 @@
 
 ; GCN: s_load_dword
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_cmp_{{eq|ne}}_u32_e64
+; GCN-NEXT: s_cmp_lg_u32
+; GCN-NEXT: s_cselect_b64
 ; GCN: s_cbranch_vccz [[BB2:BB[0-9]_[0-9]+]]
 
 ; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]:
@@ -486,8 +487,8 @@
 
 ; GCN: [[LONG_BR_DEST0]]:
 
-; GCN-DAG: v_cmp_lt_i32
-; GCN-DAG: v_cmp_ge_i32
+; GCN-DAG: s_cmp_lt_i32
+; GCN-DAG: s_cmp_ge_i32
 
 ; GCN: s_cbranch_vccz
 ; GCN: s_setpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -5,7 +5,8 @@
 ; Produces error after adding an implicit def to v_cndmask_b32
 
 ; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
+; GCN: s_cselect_b64 vcc, -1, 0
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
--- a/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-optnone.ll
@@ -10,7 +10,7 @@
 ; GCN: s_branch
 
 ; GCN-DAG: v_cmp_lt_i32
-; GCN-DAG: v_cmp_gt_i32
+; GCN-DAG: s_cmp_gt_i32
 ; GCN: s_and_b64
 ; GCN: s_mov_b64 exec
 
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -24,11 +24,12 @@
 ; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_flbit_i32_b32 s5, s4
-; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v0, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -472,12 +473,14 @@
 ; SI-NEXT:    s_flbit_i32_b32 s6, s4
 ; SI-NEXT:    s_flbit_i32_b32 s7, s5
 ; SI-NEXT:    s_add_i32 s6, s6, 32
-; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    s_cmp_eq_u32 s5, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -555,12 +558,14 @@
 ; SI-NEXT:    s_flbit_i32_b32 s6, s4
 ; SI-NEXT:    s_flbit_i32_b32 s7, s5
 ; SI-NEXT:    s_add_i32 s6, s6, 32
-; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
+; SI-NEXT:    s_cmp_eq_u32 s5, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -99,7 +99,8 @@
 
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
 ; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
-; SI-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
+; SI-DAG: s_cmp_eq_u32 s[[HI]], 0{{$}}
+; SI-DAG: s_cselect_b64 vcc, -1, 0
 ; VI-DAG: s_cmp_eq_u32 s[[HI]], 0{{$}}
 ; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
 ; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -14,10 +14,11 @@
 ; GFX7-NEXT:    s_or_b32 s4, s4, s5
 ; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
 ; GFX7-NEXT:    s_addc_u32 s4, s6, 0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    s_cselect_b64 vcc, 1, 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
+; GFX7-NEXT:    s_cmp_gt_u32 s6, 31
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX7-NEXT:    v_cmp_gt_u32_e64 vcc, s6, 31
+; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -71,15 +72,15 @@
 define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
-; GFX7-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_add_i32 s1, s0, s0
-; GFX7-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
-; GFX7-NEXT:    s_or_b32 s1, vcc_lo, vcc_hi
-; GFX7-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7-NEXT:    s_addc_u32 s0, s0, 0
+; GFX7-NEXT:    s_add_i32 s0, s2, s2
+; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
+; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX7-NEXT:    s_or_b32 s0, s0, s1
+; GFX7-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX7-NEXT:    s_addc_u32 s0, s2, 0
 ; GFX7-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
 ; GFX7-NEXT:    s_and_b64 vcc, exec, vcc
 ; GFX7-NEXT:    s_cbranch_vccnz BB1_2
@@ -99,14 +100,14 @@
 ;
 ; GFX9-LABEL: s_add_co_br_user:
 ; GFX9:       ; %bb.0: ; %bb
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s1, s0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
-; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    s_addc_u32 s0, s0, 0
+; GFX9-NEXT:    s_add_i32 s0, s2, s2
+; GFX9-NEXT:    s_cmp_lt_u32 s0, s2
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT:    s_addc_u32 s0, s2, 0
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, s0, v0
 ; GFX9-NEXT:    s_and_b64 vcc, exec, vcc
 ; GFX9-NEXT:    s_cbranch_vccnz BB1_2
@@ -129,7 +130,8 @@
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_i32 s1, s0, s0
-; GFX10-NEXT:    v_cmp_lt_u32_e64 s1, s1, s0
+; GFX10-NEXT:    s_cmp_lt_u32 s1, s0
+; GFX10-NEXT:    s_cselect_b32 s1, -1, 0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
 ; GFX10-NEXT:    s_cmpk_lg_u32 s1, 0x0
 ; GFX10-NEXT:    s_addc_u32 s0, s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -2,9 +2,12 @@
 
 ; GCN-LABEL: {{^}}float4_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2.0, [[V1]], [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], 4.0, [[V2]], [[C3]]
@@ -19,7 +22,8 @@
 ; GCN-LABEL: {{^}}int4_extelt:
 ; GCN-NOT: buffer_
 ; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 2
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX]], 1
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
 ; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], 2, [[V1]], vcc
@@ -34,9 +38,12 @@
 
 ; GCN-LABEL: {{^}}double4_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
@@ -50,10 +57,14 @@
 
 ; GCN-LABEL: {{^}}double5_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
-; GCN-DAG: v_cmp_eq_u32_e64 [[C4:[^,]+]], [[IDX]], 4
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C3]]
@@ -83,7 +94,8 @@
 
 ; GCN-LABEL: {{^}}float2_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1.0, [[C1]]
 ; GCN: store_dword v[{{[0-9:]+}}], [[V1]]
 define amdgpu_kernel void @float2_extelt(float addrspace(1)* %out, i32 %sel) {
@@ -95,7 +107,8 @@
 
 ; GCN-LABEL: {{^}}double2_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
@@ -108,13 +121,20 @@
 
 ; GCN-LABEL: {{^}}half8_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
-; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
-; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
-; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
-; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
+; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
+; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
+; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@@ -132,13 +152,20 @@
 
 ; GCN-LABEL: {{^}}short8_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
-; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
-; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
-; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
-; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
+; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
+; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
+; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@@ -156,13 +183,20 @@
 
 ; GCN-LABEL: {{^}}float8_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
-; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
-; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
-; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
-; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
+; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
+; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
+; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@@ -331,21 +365,36 @@
 
 ; GCN-LABEL: {{^}}byte16_extelt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_ne_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_ne_u32_e64 [[C3:[^,]+]], [[IDX]], 3
-; GCN-DAG: v_cmp_ne_u32_e64 [[C4:[^,]+]], [[IDX]], 4
-; GCN-DAG: v_cmp_ne_u32_e64 [[C5:[^,]+]], [[IDX]], 5
-; GCN-DAG: v_cmp_ne_u32_e64 [[C6:[^,]+]], [[IDX]], 6
-; GCN-DAG: v_cmp_ne_u32_e64 [[C7:[^,]+]], [[IDX]], 7
-; GCN-DAG: v_cmp_ne_u32_e64 [[C8:[^,]+]], [[IDX]], 8
-; GCN-DAG: v_cmp_ne_u32_e64 [[C9:[^,]+]], [[IDX]], 9
-; GCN-DAG: v_cmp_ne_u32_e64 [[C10:[^,]+]], [[IDX]], 10
-; GCN-DAG: v_cmp_ne_u32_e64 [[C11:[^,]+]], [[IDX]], 11
-; GCN-DAG: v_cmp_ne_u32_e64 [[C12:[^,]+]], [[IDX]], 12
-; GCN-DAG: v_cmp_ne_u32_e64 [[C13:[^,]+]], [[IDX]], 13
-; GCN-DAG: v_cmp_ne_u32_e64 [[C14:[^,]+]], [[IDX]], 14
-; GCN-DAG: v_cmp_ne_u32_e64 [[C15:[^,]+]], [[IDX]], 15
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[C4:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
+; GCN-DAG: s_cselect_b64 [[C5:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
+; GCN-DAG: s_cselect_b64 [[C6:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
+; GCN-DAG: s_cselect_b64 [[C7:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 8
+; GCN-DAG: s_cselect_b64 [[C8:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 9
+; GCN-DAG: s_cselect_b64 [[C9:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 10
+; GCN-DAG: s_cselect_b64 [[C10:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 11
+; GCN-DAG: s_cselect_b64 [[C11:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 12
+; GCN-DAG: s_cselect_b64 [[C12:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 13
+; GCN-DAG: s_cselect_b64 [[C13:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 14
+; GCN-DAG: s_cselect_b64 [[C14:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 15
+; GCN-DAG: s_cselect_b64 [[C15:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], {{[^,]+}}, {{[^,]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V2:v[0-9]+]], {{[^,]+}}, [[V1]], [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V3:v[0-9]+]], {{[^,]+}}, [[V2]], [[C3]]
@@ -390,9 +439,9 @@
 ; GCN-LABEL: {{^}}bit128_extelt:
 ; GCN-NOT: buffer_
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} [[V1:v[0-9]+]], 0, 1
-; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
-; GCN-DAG: v_cmp_ne_u32_e32 [[CL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
-; GCN-DAG: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
+; GCN: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
+; GCN: s_cselect_b64 [[CL:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e{{32|64}} [[VL:v[0-9]+]], 0, [[V1]], [[CL]]
 ; GCN:     v_and_b32_e32 [[RES:v[0-9]+]], 1, [[VL]]
 ; GCN:     store_dword v[{{[0-9:]+}}], [[RES]]
 define amdgpu_kernel void @bit128_extelt(i32 addrspace(1)* %out, i32 %sel) {
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -14,8 +14,10 @@
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
@@ -29,9 +31,12 @@
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -31,7 +31,8 @@
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
@@ -44,7 +45,8 @@
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
 ; GCN:     buffer_load_dwordx4
 ; GCN-NOT: buffer_load
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN: store_dwordx2 v[{{[0-9:]+}}]
@@ -58,8 +60,10 @@
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
@@ -73,9 +77,12 @@
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
 ; GCN-NOT: buffer_load
-; GCN-DAG: v_cmp_eq_u32_e64 [[C1:[^,]+]], [[IDX:s[0-9]+]], 1
-; GCN-DAG: v_cmp_eq_u32_e64 [[C2:[^,]+]], [[IDX]], 2
-; GCN-DAG: v_cmp_eq_u32_e64 [[C3:[^,]+]], [[IDX]], 3
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 3
+; GCN-DAG: s_cselect_b64 [[C3:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C2]]
 ; GCN-DAG: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[C1]]
diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
--- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -147,23 +147,26 @@
 ; GCN-LABEL: no_extract_volatile_load_dynextract:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s12, s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
-; GCN-NEXT:    s_load_dword s12, s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s10, s2
 ; GCN-NEXT:    s_mov_b32 s11, s3
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_mov_b32 s8, s6
 ; GCN-NEXT:    s_mov_b32 s9, s7
 ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
+; GCN-NEXT:    s_cmp_eq_u32 s12, 1
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s12, 2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 2
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s12, 3
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 3
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
--- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -4,51 +4,53 @@
 define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
 ; SI-LABEL: i1_copy_from_loop:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s8, 0
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
-; SI-NEXT:    ; implicit-def: $sgpr8_sgpr9
+; SI-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; SI-NEXT:    ; implicit-def: $sgpr10_sgpr11
 ; SI-NEXT:    s_branch BB0_3
-; SI-NEXT:  BB0_1: ; %Flow1
-; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:  BB0_1: ; in Loop: Header=BB0_3 Depth=1
+; SI-NEXT:    ; implicit-def: $sgpr8
 ; SI-NEXT:  BB0_2: ; %Flow
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; SI-NEXT:    s_and_b64 s[14:15], exec, s[10:11]
 ; SI-NEXT:    s_or_b64 s[4:5], s[14:15], s[4:5]
-; SI-NEXT:    s_andn2_b64 s[8:9], s[8:9], exec
+; SI-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
 ; SI-NEXT:    s_and_b64 s[12:13], s[12:13], exec
-; SI-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; SI-NEXT:    s_or_b64 s[6:7], s[6:7], s[12:13]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_cbranch_execz BB0_6
+; SI-NEXT:    s_cbranch_execz BB0_7
 ; SI-NEXT:  BB0_3: ; %for.body
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_cmp_lt_u32 s8, 4
+; SI-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; SI-NEXT:    s_or_b64 s[10:11], s[10:11], exec
-; SI-NEXT:    s_cmp_gt_u32 s6, 3
-; SI-NEXT:    v_cmp_lt_u32_e64 s[12:13], s6, 4
-; SI-NEXT:    s_cbranch_scc1 BB0_2
+; SI-NEXT:    s_cmp_gt_u32 s8, 3
+; SI-NEXT:    s_cbranch_scc1 BB0_1
 ; SI-NEXT:  ; %bb.4: ; %mid.loop
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    v_mov_b32_e32 v1, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s8
 ; SI-NEXT:    buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen
 ; SI-NEXT:    s_mov_b64 s[12:13], -1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_le_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_mov_b64 s[10:11], -1
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
-; SI-NEXT:    s_cbranch_execz BB0_1
 ; SI-NEXT:  ; %bb.5: ; %end.loop
 ; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
-; SI-NEXT:    s_add_i32 s6, s6, 1
+; SI-NEXT:    s_add_i32 s8, s8, 1
 ; SI-NEXT:    s_xor_b64 s[10:11], exec, -1
-; SI-NEXT:    s_branch BB0_1
-; SI-NEXT:  BB0_6: ; %for.end
+; SI-NEXT:  ; %bb.6: ; %Flow1
+; SI-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    s_branch BB0_2
+; SI-NEXT:  BB0_7: ; %for.end
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[8:9]
-; SI-NEXT:    s_cbranch_execz BB0_8
-; SI-NEXT:  ; %bb.7: ; %if
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
+; SI-NEXT:    s_cbranch_execz BB0_9
+; SI-NEXT:  ; %bb.8: ; %if
 ; SI-NEXT:    exp mrt0 v0, v0, v0, v0 done vm
-; SI-NEXT:  BB0_8: ; %end
+; SI-NEXT:  BB0_9: ; %end
 ; SI-NEXT:    s_endpgm
 entry:
   br label %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll
--- a/llvm/test/CodeGen/AMDGPU/icmp64.ll
+++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll
@@ -1,7 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
 
-; SI-LABEL: {{^}}test_i64_eq:
+; GCN-LABEL: {{^}}test_i64_eq:
+; VI: s_cmp_eq_u64
 ; SI: v_cmp_eq_u64
 define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp eq i64 %a, %b
@@ -10,7 +11,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_ne:
+; GCN-LABEL: {{^}}test_i64_ne:
+; VI: s_cmp_lg_u64
 ; SI: v_cmp_ne_u64
 define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ne i64 %a, %b
@@ -19,8 +21,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_slt:
-; SI: v_cmp_lt_i64
+; GCN-LABEL: {{^}}test_i64_slt:
+; GCN: v_cmp_lt_i64
 define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp slt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -28,8 +30,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_ult:
-; SI: v_cmp_lt_u64
+; GCN-LABEL: {{^}}test_i64_ult:
+; GCN: v_cmp_lt_u64
 define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ult i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -37,8 +39,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_sle:
-; SI: v_cmp_le_i64
+; GCN-LABEL: {{^}}test_i64_sle:
+; GCN: v_cmp_le_i64
 define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sle i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -46,8 +48,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_ule:
-; SI: v_cmp_le_u64
+; GCN-LABEL: {{^}}test_i64_ule:
+; GCN: v_cmp_le_u64
 define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ule i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -55,8 +57,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_sgt:
-; SI: v_cmp_gt_i64
+; GCN-LABEL: {{^}}test_i64_sgt:
+; GCN: v_cmp_gt_i64
 define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sgt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -64,8 +66,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_ugt:
-; SI: v_cmp_gt_u64
+; GCN-LABEL: {{^}}test_i64_ugt:
+; GCN: v_cmp_gt_u64
 define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -73,8 +75,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_sge:
-; SI: v_cmp_ge_i64
+; GCN-LABEL: {{^}}test_i64_sge:
+; GCN: v_cmp_ge_i64
 define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sge i64 %a, %b
   %result = sext i1 %cmp to i32
@@ -82,8 +84,8 @@
   ret void
 }
 
-; SI-LABEL: {{^}}test_i64_uge:
-; SI: v_cmp_ge_u64
+; GCN-LABEL: {{^}}test_i64_uge:
+; GCN: v_cmp_ge_u64
 define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp uge i64 %a, %b
   %result = sext i1 %cmp to i32
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -65,13 +65,13 @@
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s5, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, s3, v2
 ; GFX10-NEXT:    v_not_b32_e32 v3, v2
+; GFX10-NEXT:    v_mul_lo_u32 v4, s3, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s2, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, s4, v4
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, s4, v3
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v4
 ; GFX10-NEXT:    s_add_u32 s4, s4, 1
 ; GFX10-NEXT:    s_addc_u32 s5, s5, 0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -3,13 +3,17 @@
 ; GCN-LABEL: {{^}}float4_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
+; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
+; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
+; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
 ; GCN:     flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
 define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
@@ -61,9 +65,11 @@
 ; GCN-LABEL: {{^}}float2_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
+; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
 ; GCN:     flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
 define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
@@ -76,21 +82,29 @@
 ; GCN-LABEL: {{^}}float8_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: s_cmp_lg_u32 [[IDX:s[0-9]+]], 3
+; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 2
+; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 1
+; GCN-DAG: s_cselect_b64 [[CC3:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 0
+; GCN-DAG: s_cselect_b64 [[CC4:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 7
+; GCN-DAG: s_cselect_b64 [[CC5:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 6
+; GCN-DAG: s_cselect_b64 [[CC6:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 5
+; GCN-DAG: s_cselect_b64 [[CC7:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
+; GCN-DAG: s_cmp_lg_u32 [[IDX]], 4
+; GCN-DAG: s_cselect_b64 [[CC8:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
 ; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
 ; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
@@ -156,14 +170,14 @@
 ; GCN-LABEL: {{^}}half8_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 1
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 2
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 3
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 4
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 5
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 6
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 7
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
@@ -237,8 +251,8 @@
 ; GCN-LABEL: {{^}}byte16_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
-; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 {{s[0-9]+}}, 15
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
@@ -273,10 +287,12 @@
 ; GCN-LABEL: {{^}}double2_inselt:
 ; GCN-NOT: v_movrel
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cmp_eq_u32 [[IDX:s[0-9]+]], 1
+; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
-; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
+; GCN-DAG: s_cmp_eq_u32 [[IDX]], 0
+; GCN-DAG: s_cselect_b64 [[CC2:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
 define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
@@ -372,10 +388,12 @@
 
 ; GCN-LABEL: {{^}}bit128_inselt:
 ; GCN-NOT: buffer_
-; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN-DAG: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
-; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
-; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
+
+; GCN-DAG: s_cmpk_lg_i32 {{s[0-9]+}}, 0x7f
+; GCN-DAG: s_cselect_b64 [[CCL:[^,]+]], -1, 0
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
 define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -290,10 +290,12 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, s6
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -308,10 +310,12 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s7
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -331,13 +335,16 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s10
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cmp_lg_u32 s4, 2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v3, s8
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -352,13 +359,16 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s10
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; VI-NEXT:    s_cmp_lg_u32 s4, 2
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s8
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -378,16 +388,20 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; SI-NEXT:    s_cmp_lg_u32 s4, 3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s10
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s8
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -402,16 +416,20 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s11
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; VI-NEXT:    s_cmp_lg_u32 s4, 3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 2
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s10
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s9
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, s8
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -431,28 +449,36 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; SI-NEXT:    s_cmp_lg_u32 s4, 3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; SI-NEXT:    s_cmp_lg_u32 s4, 6
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s14
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
+; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; SI-NEXT:    s_cmp_lg_u32 s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v8, s12
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -468,28 +494,36 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; VI-NEXT:    s_cmp_lg_u32 s4, 3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 2
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, 7
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s15
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; VI-NEXT:    s_cmp_lg_u32 s4, 6
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v4, v5, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s14
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
+; VI-NEXT:    s_cmp_lg_u32 s4, 5
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v4, v5, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s13
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; VI-NEXT:    s_cmp_lg_u32 s4, 4
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v4, v5, vcc
 ; VI-NEXT:    v_mov_b32_e32 v8, s12
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -580,10 +614,12 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -619,13 +655,16 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cmp_lg_u32 s4, 2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -665,17 +704,21 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 3
+; SI-NEXT:    s_cmp_eq_u32 s6, 3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s4
+; SI-NEXT:    s_cmp_eq_u32 s6, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 2
+; SI-NEXT:    s_cmp_eq_u32 s6, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 1
+; SI-NEXT:    s_cmp_eq_u32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s6, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
@@ -718,28 +761,36 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; SI-NEXT:    s_cmp_lg_u32 s4, 3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 5, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s15
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; SI-NEXT:    s_cmp_lg_u32 s4, 6
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 5, v4, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s14
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
+; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_cndmask_b32_e32 v6, 5, v4, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s13
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; SI-NEXT:    s_cmp_lg_u32 s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s12
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -1132,96 +1183,112 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s5, s11, 24
+; SI-NEXT:    s_cmp_lg_u32 s4, 15
 ; SI-NEXT:    v_mov_b32_e32 v0, s5
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_lshr_b32 s5, s11, 16
+; SI-NEXT:    s_cmp_lg_u32 s4, 14
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s5
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_lshr_b32 s6, s11, 8
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
 ; SI-NEXT:    s_movk_i32 s5, 0xff
+; SI-NEXT:    s_cmp_lg_u32 s4, 13
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s5, v1
-; SI-NEXT:    s_lshr_b32 s6, s11, 8
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, s6
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lg_u32 s4, 12
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s11
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_and_b32_e32 v2, s5, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    s_mov_b32 s6, 0xffff
+; SI-NEXT:    s_lshr_b32 s7, s10, 24
+; SI-NEXT:    s_cmp_lg_u32 s4, 11
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s6, v1
-; SI-NEXT:    s_lshr_b32 s7, s10, 24
 ; SI-NEXT:    v_or_b32_e32 v3, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_lshr_b32 s7, s10, 16
+; SI-NEXT:    s_cmp_lg_u32 s4, 10
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    s_lshr_b32 s7, s10, 8
+; SI-NEXT:    s_cmp_lg_u32 s4, 9
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s5, v1
-; SI-NEXT:    s_lshr_b32 s7, s10, 8
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
+; SI-NEXT:    s_cmp_lg_u32 s4, 8
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, s10
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
 ; SI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_and_b32_e32 v2, s5, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v2, v1
+; SI-NEXT:    s_lshr_b32 s7, s9, 24
+; SI-NEXT:    s_cmp_lg_u32 s4, 7
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s6, v1
-; SI-NEXT:    s_lshr_b32 s7, s9, 24
 ; SI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_lshr_b32 s7, s9, 16
+; SI-NEXT:    s_cmp_lg_u32 s4, 6
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    s_lshr_b32 s7, s9, 8
+; SI-NEXT:    s_cmp_lg_u32 s4, 5
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s5, v1
-; SI-NEXT:    s_lshr_b32 s7, s9, 8
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; SI-NEXT:    s_cmp_lg_u32 s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; SI-NEXT:    v_and_b32_e32 v4, s5, v4
 ; SI-NEXT:    v_or_b32_e32 v1, v4, v1
+; SI-NEXT:    s_lshr_b32 s7, s8, 24
+; SI-NEXT:    s_cmp_lg_u32 s4, 3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s6, v1
-; SI-NEXT:    s_lshr_b32 s7, s8, 24
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    s_lshr_b32 s7, s8, 16
+; SI-NEXT:    s_cmp_lg_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; SI-NEXT:    s_lshr_b32 s7, s8, 8
+; SI-NEXT:    s_cmp_lg_u32 s4, 1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
 ; SI-NEXT:    v_and_b32_e32 v4, s5, v4
-; SI-NEXT:    s_lshr_b32 s7, s8, 8
 ; SI-NEXT:    v_or_b32_e32 v0, v4, v0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_lg_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s8
-; SI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; SI-NEXT:    v_and_b32_e32 v5, s5, v5
@@ -1241,81 +1308,97 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s11, 24
+; VI-NEXT:    s_cmp_lg_u32 s4, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 15
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s11, 16
+; VI-NEXT:    s_cmp_lg_u32 s4, 14
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 14
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s11, 8
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cmp_lg_u32 s4, 13
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 13
+; VI-NEXT:    s_cmp_lg_u32 s4, 12
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s11
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 12
-; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
+; VI-NEXT:    s_lshr_b32 s5, s10, 24
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_lshr_b32 s5, s10, 24
+; VI-NEXT:    s_cmp_lg_u32 s4, 11
 ; VI-NEXT:    v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 11
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s10, 16
+; VI-NEXT:    s_cmp_lg_u32 s4, 10
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 10
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s10, 8
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cmp_lg_u32 s4, 9
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 9
+; VI-NEXT:    s_cmp_lg_u32 s4, 8
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s10
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 8
-; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
+; VI-NEXT:    s_lshr_b32 s5, s9, 24
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_cndmask_b32_e32 v2, 5, v2, vcc
 ; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_lshr_b32 s5, s9, 24
+; VI-NEXT:    s_cmp_lg_u32 s4, 7
 ; VI-NEXT:    v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 7
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s9, 16
+; VI-NEXT:    s_cmp_lg_u32 s4, 6
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 6
-; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s9, 8
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cmp_lg_u32 s4, 5
 ; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 5
+; VI-NEXT:    s_cmp_lg_u32 s4, 4
 ; VI-NEXT:    v_cndmask_b32_e32 v1, 5, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s9
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 4
-; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; VI-NEXT:    s_lshr_b32 s5, s8, 24
 ; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT:    s_lshr_b32 s5, s8, 24
+; VI-NEXT:    s_cmp_lg_u32 s4, 3
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 3
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s8, 16
+; VI-NEXT:    s_cmp_lg_u32 s4, 2
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 5, v0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 2
-; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
-; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_lshr_b32 s5, s8, 8
+; VI-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
+; VI-NEXT:    s_cmp_lg_u32 s4, 1
 ; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v4, s5
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_lg_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v4, 5, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s8
-; VI-NEXT:    v_cmp_ne_u32_e64 vcc, s4, 0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
 ; VI-NEXT:    v_cndmask_b32_e32 v5, 5, v5, vcc
 ; VI-NEXT:    v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -1413,12 +1496,14 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_eq_u32 s4, 1
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cmp_eq_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -1435,12 +1520,14 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_eq_u32 s4, 1
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cmp_eq_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -1454,19 +1541,21 @@
 define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i64:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x4
-; SI-NEXT:    s_load_dword s6, s[4:5], 0x8
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_cmp_eq_u32 s6, 1
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
 ; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cmp_eq_u32 s6, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
@@ -1475,19 +1564,21 @@
 ;
 ; VI-LABEL: dynamic_insertelement_v2i64:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
-; VI-NEXT:    s_load_dword s6, s[4:5], 0x20
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_cmp_eq_u32 s6, 1
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 1
 ; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cmp_eq_u32 s6, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, 0
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
@@ -1510,16 +1601,19 @@
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
 ; SI-NEXT:    v_mov_b32_e32 v4, s7
-; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 1
+; SI-NEXT:    s_cmp_eq_u32 s12, 1
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cmp_eq_u32 s12, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_cmp_eq_u32 s12, 2
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
-; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 2
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v4, s6
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[4:5]
@@ -1537,17 +1631,20 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 1
+; VI-NEXT:    s_cmp_eq_u32 s12, 1
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v3, v0, 0, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cmp_eq_u32 s12, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 5, s[4:5]
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    s_cmp_eq_u32 s12, 2
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 5, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, s7
-; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 2
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v5, v4, 0, s[4:5]
 ; VI-NEXT:    v_mov_b32_e32 v4, s6
 ; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 5, s[4:5]
@@ -1570,22 +1667,26 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; SI-NEXT:    s_cmp_eq_u32 s4, 1
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    s_cmp_eq_u32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s9
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_cmp_eq_u32 s4, 3
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s14
+; SI-NEXT:    s_cmp_eq_u32 s4, 2
 ; SI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s12
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
@@ -1603,22 +1704,26 @@
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s11
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
+; VI-NEXT:    s_cmp_eq_u32 s4, 1
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s10
+; VI-NEXT:    s_cmp_eq_u32 s4, 0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s9
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v0, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    s_cmp_eq_u32 s4, 3
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v5, s15
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 3
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s14
+; VI-NEXT:    s_cmp_eq_u32 s4, 2
 ; VI-NEXT:    v_cndmask_b32_e64 v6, v5, 0, vcc
 ; VI-NEXT:    v_mov_b32_e32 v5, s13
-; VI-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 2
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v5, v4, vcc
 ; VI-NEXT:    v_mov_b32_e32 v4, s12
 ; VI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -86,7 +86,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
   %cmp = icmp eq i32 %i, 0
@@ -119,7 +119,8 @@
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
 
 ; SI-DAG: v_cmp_eq_u32_e32 [[CMP0:vcc]], 0, v{{[0-9]+}}
-; SI-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
+; SI-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0{{$}}
+; SI-DAG: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
 ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
@@ -298,8 +298,10 @@
 }
 
 ; GCN-LABEL: {{^}}v_icmp_i1_ne0:
-; GCN: v_cmp_gt_u32_e64 s[[C0:\[[0-9]+:[0-9]+\]]],
-; GCN: v_cmp_gt_u32_e64 s[[C1:\[[0-9]+:[0-9]+\]]],
+; GCN: s_cmp_gt_u32
+; GCN: s_cselect_b64 s[[C0:\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN: s_cmp_gt_u32
+; GCN: s_cselect_b64 s[[C1:\[[0-9]+:[0-9]+\]]], -1, 0
 ; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]]
 ; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1
 ; GCN-NEXT: v_mov_b32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -29,8 +29,7 @@
 ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}}
 ; GFX9: s_lshl_b32 [[APERTURE]], [[APERTURE]], 16
 
-; GCN: v_mov_b32_e32 [[V_APERTURE:v[0-9]+]], [[APERTURE]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[PTR_HI]], [[V_APERTURE]]
+; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
 ; GCN: s_cbranch_vccnz
 define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
   %val = call i1 @llvm.amdgcn.is.private(i8* %ptr)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -30,8 +30,7 @@
 ; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}}
 ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x4{{$}}
 
-; GCN: v_mov_b32_e32 [[V_APERTURE:v[0-9]+]], [[APERTURE]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[PTR_HI]], [[V_APERTURE]]
+; GCN: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]]
 ; GCN: s_cbranch_vccnz
 define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
   %val = call i1 @llvm.amdgcn.is.shared(i8* %ptr)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -265,7 +265,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_scc_liveness:
-; GCN: v_cmp
+; GCN: s_cmp
 ; GCN: s_and_b64 exec
 ; GCN: s_cmp
 ; GCN: s_cbranch_scc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -246,9 +246,10 @@
 ; GFX9-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX9-NEXT:    s_add_i32 s1, s8, s7
 ; GFX9-NEXT:    s_add_i32 s1, s1, s6
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
 ; GFX9-NEXT:    s_mul_i32 s2, s0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
@@ -274,11 +275,12 @@
 ; GFX10-NEXT:    s_add_u32 s4, s3, s1
 ; GFX10-NEXT:    s_addc_u32 s5, 0, s5
 ; GFX10-NEXT:    s_add_i32 s1, s8, s7
-; GFX10-NEXT:    v_cmp_ne_u64_e64 s3, s[4:5], 0
-; GFX10-NEXT:    s_add_i32 s1, s1, s6
 ; GFX10-NEXT:    s_mul_i32 s0, s0, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s3
+; GFX10-NEXT:    s_add_i32 s1, s1, s6
+; GFX10-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, s2
 ; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[0:1], off
 ; GFX10-NEXT:    s_endpgm
 bb:
@@ -306,10 +308,11 @@
 ; SI-NEXT:    v_mul_hi_u32 v1, s0, v1
 ; SI-NEXT:    v_mul_hi_i32 v3, s1, v3
 ; SI-NEXT:    s_mul_i32 s6, s1, s3
-; SI-NEXT:    s_mul_i32 s8, s0, s2
+; SI-NEXT:    s_cmp_lt_i32 s1, 0
+; SI-NEXT:    s_mul_i32 s1, s0, s2
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v1
 ; SI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s8
+; SI-NEXT:    v_mov_b32_e32 v6, s1
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, s4, v5
 ; SI-NEXT:    v_addc_u32_e32 v2, vcc, v4, v2, vcc
 ; SI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -319,14 +322,15 @@
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v1
 ; SI-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v2
 ; SI-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_lt_i32 s3, 0
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v2
 ; SI-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v3, vcc
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s3, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; SI-NEXT:    v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
@@ -356,7 +360,8 @@
 ; GFX9-NEXT:    s_addc_u32 s6, 0, s6
 ; GFX9-NEXT:    s_sub_u32 s9, s4, s2
 ; GFX9-NEXT:    s_subb_u32 s10, s6, 0
-; GFX9-NEXT:    v_cmp_lt_i32_e64 vcc, s1, 0
+; GFX9-NEXT:    s_cmp_lt_i32 s1, 0
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
@@ -364,10 +369,11 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v2
-; GFX9-NEXT:    s_add_i32 s1, s8, s7
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
+; GFX9-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    s_add_i32 s1, s8, s7
 ; GFX9-NEXT:    s_add_i32 s1, s1, s5
-; GFX9-NEXT:    v_cmp_lt_i32_e64 vcc, s3, 0
 ; GFX9-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
@@ -401,21 +407,23 @@
 ; GFX10-NEXT:    s_addc_u32 s6, 0, s6
 ; GFX10-NEXT:    s_sub_u32 s9, s4, s2
 ; GFX10-NEXT:    s_subb_u32 s10, s6, 0
-; GFX10-NEXT:    v_cmp_lt_i32_e64 vcc_lo, s1, 0
-; GFX10-NEXT:    v_mov_b32_e32 v0, s9
-; GFX10-NEXT:    v_mov_b32_e32 v1, s10
+; GFX10-NEXT:    v_mov_b32_e32 v1, s9
+; GFX10-NEXT:    s_cmp_lt_i32 s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v0, s10
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_cmp_lt_i32 s3, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, s4, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v3, vcc_lo, v2, s0
+; GFX10-NEXT:    s_mul_i32 s0, s0, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX10-NEXT:    s_add_i32 s1, s8, s7
 ; GFX10-NEXT:    s_add_i32 s1, s1, s5
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, s6, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
 ; GFX10-NEXT:    s_ashr_i32 s4, s1, 31
 ; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v0, s0
-; GFX10-NEXT:    s_mul_i32 s0, s0, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_lt_i32_e64 vcc_lo, s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, s1, 0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, s0, 0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -16,12 +16,14 @@
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s5
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[10:11], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s11, 0x80000000
+; SI-NEXT:    s_cmp_lt_i32 s5, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s5, 51
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s5, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s5, 51
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -155,12 +157,14 @@
 ; SI-NEXT:    s_brev_b32 s15, 1
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s11, s15
-; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cmp_lt_i32 s14, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s13
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s14, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cmp_gt_i32 s14, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s14, 51
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s12
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -169,23 +173,25 @@
 ; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s9, 0xb0014
 ; SI-NEXT:    s_add_i32 s7, s0, s7
+; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
 ; SI-NEXT:    s_brev_b32 s10, -2
 ; SI-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s11
-; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s7
 ; SI-NEXT:    v_bfi_b32 v4, s10, v6, v4
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s9, s15
+; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_cmp_lt_i32 s7, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s7, 51
 ; SI-NEXT:    v_mov_b32_e32 v1, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s7, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s7, 51
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -250,12 +256,14 @@
 ; SI-NEXT:    s_brev_b32 s20, 1
 ; SI-NEXT:    s_andn2_b64 s[16:17], s[10:11], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s11, s20
-; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cmp_lt_i32 s19, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s17
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s19, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cmp_gt_i32 s19, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s19, 51
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s16
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
@@ -268,40 +276,44 @@
 ; SI-NEXT:    s_brev_b32 s16, -2
 ; SI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v4, s11
-; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s17
-; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    v_bfi_b32 v4, s16, v12, v4
 ; SI-NEXT:    s_andn2_b64 s[10:11], s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s9, s20
+; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
+; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_cmp_lt_i32 s17, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s17, 0
+; SI-NEXT:    v_mov_b32_e32 v1, s0
+; SI-NEXT:    s_cmp_gt_i32 s17, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s17, 51
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s10
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
-; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
 ; SI-NEXT:    s_bfe_u32 s0, s15, 0xb0014
 ; SI-NEXT:    s_add_i32 s10, s0, s18
-; SI-NEXT:    v_mov_b32_e32 v6, s9
+; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s10
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v6, s9
 ; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[0:1]
-; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
 ; SI-NEXT:    s_and_b32 s0, s15, s20
+; SI-NEXT:    v_bfi_b32 v6, s16, v12, v6
+; SI-NEXT:    s_cmp_lt_i32 s10, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v6, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    v_mov_b32_e32 v5, s0
+; SI-NEXT:    s_cmp_gt_i32 s10, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s10, 51
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v4, s8
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
@@ -309,22 +321,24 @@
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
 ; SI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
 ; SI-NEXT:    s_bfe_u32 s0, s13, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v10, s15
 ; SI-NEXT:    s_add_i32 s8, s0, s18
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v10, s15
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s8
+; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
 ; SI-NEXT:    v_bfi_b32 v10, s16, v12, v10
-; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, 0
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[12:13], s[0:1]
 ; SI-NEXT:    s_and_b32 s0, s13, s20
+; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
+; SI-NEXT:    v_mov_b32_e32 v6, 0
+; SI-NEXT:    s_cmp_lt_i32 s8, 0
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; SI-NEXT:    v_mov_b32_e32 v5, s0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v4, s3
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s8, 0
+; SI-NEXT:    v_mov_b32_e32 v5, s0
+; SI-NEXT:    s_cmp_gt_i32 s8, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_gt_i32_e64 s[0:1], s8, 51
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v4, s2
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
@@ -396,194 +410,210 @@
 define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
 ; SI-LABEL: round_v8f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0x19
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s7, 0xfc01
-; SI-NEXT:    s_mov_b32 s5, 0xfffff
-; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_load_dwordx16 s[16:31], s[0:1], 0x19
+; SI-NEXT:    s_mov_b32 s14, -1
+; SI-NEXT:    s_movk_i32 s15, 0xfc01
+; SI-NEXT:    s_mov_b32 s13, 0xfffff
+; SI-NEXT:    s_mov_b32 s12, s14
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_bfe_u32 s2, s11, 0xb0014
-; SI-NEXT:    s_add_i32 s26, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s26
-; SI-NEXT:    s_brev_b32 s27, 1
-; SI-NEXT:    s_andn2_b64 s[24:25], s[10:11], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s11, s27
+; SI-NEXT:    s_bfe_u32 s2, s19, 0xb0014
+; SI-NEXT:    s_add_i32 s6, s2, s15
+; SI-NEXT:    s_lshr_b64 s[2:3], s[12:13], s6
+; SI-NEXT:    s_brev_b32 s33, 1
+; SI-NEXT:    s_andn2_b64 s[4:5], s[18:19], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s19, s33
+; SI-NEXT:    s_cmp_lt_i32 s6, 0
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    v_mov_b32_e32 v0, s25
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s26, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s11
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s26, 51
+; SI-NEXT:    v_mov_b32_e32 v1, s19
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s24
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v2, s18
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
-; SI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s2, s9, 0xb0014
-; SI-NEXT:    s_add_i32 s25, s2, s7
+; SI-NEXT:    v_add_f64 v[2:3], s[18:19], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s2, s17, 0xb0014
+; SI-NEXT:    s_add_i32 s6, s2, s15
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; SI-NEXT:    s_brev_b32 s24, -2
-; SI-NEXT:    v_mov_b32_e32 v18, 0x3ff00000
-; SI-NEXT:    v_mov_b32_e32 v4, s11
-; SI-NEXT:    v_bfi_b32 v4, s24, v18, v4
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s25
+; SI-NEXT:    s_lshr_b64 s[2:3], s[12:13], s6
+; SI-NEXT:    s_brev_b32 s34, -2
+; SI-NEXT:    v_mov_b32_e32 v14, 0x3ff00000
+; SI-NEXT:    v_mov_b32_e32 v4, s19
+; SI-NEXT:    v_bfi_b32 v4, s34, v14, v4
+; SI-NEXT:    s_andn2_b64 s[4:5], s[16:17], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s17, s33
 ; SI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; SI-NEXT:    v_mov_b32_e32 v2, 0
-; SI-NEXT:    s_andn2_b64 s[10:11], s[8:9], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s9, s27
+; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
-; SI-NEXT:    v_mov_b32_e32 v0, s11
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s25, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NEXT:    v_mov_b32_e32 v1, s9
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s25, 51
+; SI-NEXT:    v_mov_b32_e32 v1, s17
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v0, s10
+; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s16
 ; SI-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
-; SI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
-; SI-NEXT:    s_bfe_u32 s2, s15, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v6, s9
-; SI-NEXT:    s_add_i32 s10, s2, s7
+; SI-NEXT:    v_add_f64 v[4:5], s[16:17], -v[0:1]
+; SI-NEXT:    s_bfe_u32 s2, s23, 0xb0014
+; SI-NEXT:    s_add_i32 s6, s2, s15
+; SI-NEXT:    v_mov_b32_e32 v6, s17
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; SI-NEXT:    v_bfi_b32 v6, s24, v18, v6
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s10
+; SI-NEXT:    s_lshr_b64 s[2:3], s[12:13], s6
+; SI-NEXT:    v_bfi_b32 v6, s34, v14, v6
+; SI-NEXT:    s_andn2_b64 s[4:5], s[22:23], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s23, s33
 ; SI-NEXT:    v_cndmask_b32_e32 v5, 0, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
-; SI-NEXT:    s_andn2_b64 s[8:9], s[14:15], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s15, s27
+; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s15
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
+; SI-NEXT:    v_mov_b32_e32 v5, s23
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v6, s22
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[2:3]
-; SI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s2, s13, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v8, s15
-; SI-NEXT:    s_add_i32 s10, s2, s7
+; SI-NEXT:    v_add_f64 v[6:7], s[22:23], -v[4:5]
+; SI-NEXT:    s_bfe_u32 s2, s21, 0xb0014
+; SI-NEXT:    s_add_i32 s6, s2, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s23
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
-; SI-NEXT:    v_bfi_b32 v8, s24, v18, v8
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s10
+; SI-NEXT:    s_lshr_b64 s[2:3], s[12:13], s6
+; SI-NEXT:    v_bfi_b32 v8, s34, v14, v8
+; SI-NEXT:    s_andn2_b64 s[4:5], s[20:21], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s21, s33
 ; SI-NEXT:    v_cndmask_b32_e32 v7, 0, v8, vcc
 ; SI-NEXT:    v_mov_b32_e32 v6, 0
-; SI-NEXT:    s_andn2_b64 s[8:9], s[12:13], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s13, s27
+; SI-NEXT:    s_cmp_lt_i32 s6, 0
 ; SI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_mov_b32_e32 v4, s5
 ; SI-NEXT:    v_mov_b32_e32 v5, s2
-; SI-NEXT:    v_mov_b32_e32 v4, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 51
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-NEXT:    v_mov_b32_e32 v5, s13
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
+; SI-NEXT:    v_mov_b32_e32 v5, s21
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v4, s8
+; SI-NEXT:    v_mov_b32_e32 v4, s4
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v8, s12
+; SI-NEXT:    v_mov_b32_e32 v8, s20
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[2:3]
-; SI-NEXT:    v_add_f64 v[8:9], s[12:13], -v[4:5]
-; SI-NEXT:    s_bfe_u32 s2, s19, 0xb0014
-; SI-NEXT:    v_mov_b32_e32 v10, s13
-; SI-NEXT:    s_add_i32 s10, s2, s7
+; SI-NEXT:    s_bfe_u32 s2, s27, 0xb0014
+; SI-NEXT:    s_add_i32 s4, s2, s15
+; SI-NEXT:    v_add_f64 v[8:9], s[20:21], -v[4:5]
+; SI-NEXT:    s_lshr_b64 s[2:3], s[12:13], s4
+; SI-NEXT:    v_mov_b32_e32 v10, s21
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
-; SI-NEXT:    v_bfi_b32 v10, s24, v18, v10
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s10
+; SI-NEXT:    s_andn2_b64 s[16:17], s[26:27], s[2:3]
+; SI-NEXT:    s_and_b32 s2, s27, s33
+; SI-NEXT:    v_bfi_b32 v10, s34, v14, v10
+; SI-NEXT:    s_cmp_lt_i32 s4, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v10, vcc
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
-; SI-NEXT:    s_andn2_b64 s[8:9], s[18:19], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s19, s27
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s4, 51
 ; SI-NEXT:    v_add_f64 v[4:5], v[4:5], v[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v9, s2
-; SI-NEXT:    v_mov_b32_e32 v8, s9
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s10, 0
-; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s19
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s10, 51
-; SI-NEXT:    v_cndmask_b32_e64 v13, v8, v9, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v8, s8
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s18
-; SI-NEXT:    v_cndmask_b32_e64 v12, v8, v9, s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s17, 0xb0014
-; SI-NEXT:    s_add_i32 s12, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s12
-; SI-NEXT:    s_andn2_b64 s[8:9], s[16:17], s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s23, 0xb0014
-; SI-NEXT:    s_add_i32 s14, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s14
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    s_bfe_u32 s4, s25, 0xb0014
+; SI-NEXT:    s_add_i32 s6, s4, s15
+; SI-NEXT:    s_lshr_b64 s[4:5], s[12:13], s6
+; SI-NEXT:    s_andn2_b64 s[18:19], s[24:25], s[4:5]
+; SI-NEXT:    s_and_b32 s4, s25, s33
+; SI-NEXT:    v_mov_b32_e32 v8, s17
+; SI-NEXT:    s_cmp_lt_i32 s6, 0
+; SI-NEXT:    v_cndmask_b32_e32 v15, v8, v9, vcc
+; SI-NEXT:    v_mov_b32_e32 v9, s4
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 51
+; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; SI-NEXT:    s_bfe_u32 s8, s31, 0xb0014
+; SI-NEXT:    s_add_i32 s17, s8, s15
+; SI-NEXT:    s_lshr_b64 s[8:9], s[12:13], s17
+; SI-NEXT:    s_andn2_b64 s[10:11], s[30:31], s[8:9]
+; SI-NEXT:    s_and_b32 s8, s31, s33
 ; SI-NEXT:    v_mov_b32_e32 v8, s19
-; SI-NEXT:    s_andn2_b64 s[10:11], s[22:23], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s23, s27
-; SI-NEXT:    v_bfi_b32 v19, s24, v18, v8
-; SI-NEXT:    v_mov_b32_e32 v9, s2
+; SI-NEXT:    s_cmp_lt_i32 s17, 0
+; SI-NEXT:    v_cndmask_b32_e64 v17, v8, v9, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v9, s8
+; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; SI-NEXT:    v_mov_b32_e32 v8, s11
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s14, 0
-; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; SI-NEXT:    v_mov_b32_e32 v9, s23
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s14, 51
-; SI-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v8, s10
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v10, s22
-; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[2:3]
-; SI-NEXT:    s_bfe_u32 s2, s21, 0xb0014
-; SI-NEXT:    s_add_i32 s7, s2, s7
-; SI-NEXT:    s_lshr_b64 s[2:3], s[4:5], s7
-; SI-NEXT:    s_andn2_b64 s[4:5], s[20:21], s[2:3]
-; SI-NEXT:    s_and_b32 s2, s21, s27
-; SI-NEXT:    v_mov_b32_e32 v11, s2
-; SI-NEXT:    v_mov_b32_e32 v10, s5
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s7, 0
-; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s7, 51
-; SI-NEXT:    v_mov_b32_e32 v11, s21
-; SI-NEXT:    v_cndmask_b32_e64 v15, v10, v11, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v10, s4
-; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v11, s20
-; SI-NEXT:    v_cndmask_b32_e64 v14, v10, v11, s[2:3]
-; SI-NEXT:    v_add_f64 v[10:11], s[20:21], -v[14:15]
-; SI-NEXT:    v_mov_b32_e32 v17, s23
-; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5
-; SI-NEXT:    v_add_f64 v[10:11], s[22:23], -v[8:9]
-; SI-NEXT:    v_mov_b32_e32 v16, s21
-; SI-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5
-; SI-NEXT:    v_bfi_b32 v17, s24, v18, v17
-; SI-NEXT:    v_cndmask_b32_e64 v11, 0, v17, s[2:3]
+; SI-NEXT:    s_cmp_gt_i32 s17, 51
+; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v10, s10
+; SI-NEXT:    v_mov_b32_e32 v9, s31
+; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v9, v8, v9, s[10:11]
+; SI-NEXT:    v_cndmask_b32_e64 v8, v10, 0, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v10, s30
+; SI-NEXT:    s_bfe_u32 s8, s29, 0xb0014
+; SI-NEXT:    v_cndmask_b32_e64 v8, v8, v10, s[10:11]
+; SI-NEXT:    s_add_i32 s10, s8, s15
+; SI-NEXT:    s_lshr_b64 s[8:9], s[12:13], s10
+; SI-NEXT:    s_andn2_b64 s[12:13], s[28:29], s[8:9]
+; SI-NEXT:    s_and_b32 s8, s29, s33
+; SI-NEXT:    s_cmp_lt_i32 s10, 0
+; SI-NEXT:    v_mov_b32_e32 v11, s8
+; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT:    v_mov_b32_e32 v10, s13
+; SI-NEXT:    s_cmp_gt_i32 s10, 51
+; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v11, s29
+; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v13, v10, v11, s[10:11]
+; SI-NEXT:    v_mov_b32_e32 v10, s12
+; SI-NEXT:    v_cndmask_b32_e64 v10, v10, 0, s[8:9]
+; SI-NEXT:    v_mov_b32_e32 v11, s28
+; SI-NEXT:    v_cndmask_b32_e64 v12, v10, v11, s[10:11]
+; SI-NEXT:    v_add_f64 v[10:11], s[28:29], -v[12:13]
+; SI-NEXT:    v_mov_b32_e32 v19, s29
+; SI-NEXT:    v_cmp_ge_f64_e64 s[8:9], |v[10:11]|, 0.5
+; SI-NEXT:    v_mov_b32_e32 v10, s31
+; SI-NEXT:    v_bfi_b32 v20, s34, v14, v10
+; SI-NEXT:    v_add_f64 v[10:11], s[30:31], -v[8:9]
+; SI-NEXT:    v_bfi_b32 v19, s34, v14, v19
+; SI-NEXT:    v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5
 ; SI-NEXT:    v_mov_b32_e32 v10, 0
-; SI-NEXT:    v_bfi_b32 v16, s24, v18, v16
+; SI-NEXT:    v_cndmask_b32_e64 v11, 0, v20, s[10:11]
 ; SI-NEXT:    v_add_f64 v[10:11], v[8:9], v[10:11]
-; SI-NEXT:    v_cndmask_b32_e32 v9, 0, v16, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v9, 0, v19, s[8:9]
 ; SI-NEXT:    v_mov_b32_e32 v8, 0
-; SI-NEXT:    s_and_b32 s13, s17, s27
-; SI-NEXT:    v_add_f64 v[8:9], v[14:15], v[8:9]
-; SI-NEXT:    v_mov_b32_e32 v14, s9
-; SI-NEXT:    v_mov_b32_e32 v15, s13
-; SI-NEXT:    v_cmp_lt_i32_e64 vcc, s12, 0
-; SI-NEXT:    v_cndmask_b32_e32 v14, v14, v15, vcc
-; SI-NEXT:    v_mov_b32_e32 v15, s17
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s12, 51
-; SI-NEXT:    v_cndmask_b32_e64 v17, v14, v15, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v14, s8
-; SI-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
-; SI-NEXT:    v_mov_b32_e32 v15, s16
-; SI-NEXT:    v_cndmask_b32_e64 v16, v14, v15, s[2:3]
-; SI-NEXT:    v_mov_b32_e32 v14, s17
-; SI-NEXT:    v_bfi_b32 v18, s24, v18, v14
-; SI-NEXT:    v_add_f64 v[14:15], s[16:17], -v[16:17]
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:    v_add_f64 v[8:9], v[12:13], v[8:9]
+; SI-NEXT:    v_mov_b32_e32 v12, s16
+; SI-NEXT:    v_mov_b32_e32 v16, s27
+; SI-NEXT:    v_cndmask_b32_e64 v13, v15, v16, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
+; SI-NEXT:    v_mov_b32_e32 v15, s26
+; SI-NEXT:    v_cndmask_b32_e64 v12, v12, v15, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v15, s27
+; SI-NEXT:    v_bfi_b32 v19, s34, v14, v15
+; SI-NEXT:    v_mov_b32_e32 v15, s18
+; SI-NEXT:    v_mov_b32_e32 v18, s25
+; SI-NEXT:    v_cndmask_b32_e64 v15, v15, 0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v16, s24
+; SI-NEXT:    v_cndmask_b32_e64 v16, v15, v16, s[6:7]
+; SI-NEXT:    v_cndmask_b32_e64 v17, v17, v18, s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v15, s25
+; SI-NEXT:    v_bfi_b32 v18, s34, v14, v15
+; SI-NEXT:    v_add_f64 v[14:15], s[24:25], -v[16:17]
+; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
 ; SI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
-; SI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[12:13]
-; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    v_add_f64 v[14:15], s[26:27], -v[12:13]
+; SI-NEXT:    s_mov_b32 s15, 0xf000
 ; SI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
 ; SI-NEXT:    v_mov_b32_e32 v14, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v15, 0, v19, s[0:1]
@@ -592,10 +622,10 @@
 ; SI-NEXT:    v_mov_b32_e32 v12, 0
 ; SI-NEXT:    v_add_f64 v[12:13], v[16:17], v[12:13]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
-; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[12:15], 0 offset:48
+; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[12:15], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; CI-LABEL: round_v8f64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -8,7 +8,7 @@
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 
-; GCN: v_cmp_eq_u32
+; GCN: s_cmp_eq_u32
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -384,7 +384,7 @@
 
 ; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
 ; SI-NOT: v_min
-; GCN: v_cmp_lt_u32
+; GCN: s_cmp_lt_u32
 ; SI-NOT: v_min
 ; SI: v_cndmask_b32
 ; SI-NOT: v_min
diff --git a/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu.mir b/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu.mir
--- a/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu.mir
+++ b/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu.mir
@@ -19,9 +19,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -63,9 +63,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -107,9 +107,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -147,9 +147,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -191,9 +191,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -234,9 +234,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE1]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -275,9 +275,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -316,9 +316,9 @@
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec
   ; GCN:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec
   ; GCN:   [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1
-  ; GCN:   V_CMP_NE_U64_e32 0, [[REG_SEQUENCE]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec
   ; GCN:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U64_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -352,9 +352,9 @@
   ; GCN:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %6, %bb.1
   ; GCN:   [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec
-  ; GCN:   V_CMP_NE_U32_e32 0, [[V_AND_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
@@ -388,9 +388,9 @@
   ; GCN:   [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GCN:   SCRATCH_STORE_DWORD [[DEF]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr
   ; GCN:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec
-  ; GCN:   V_CMP_NE_U32_e32 0, [[V_AND_B32_e64_]], implicit-def $vcc, implicit $exec
+  ; GCN:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec
   ; GCN:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec
-  ; GCN:   $vcc = S_AND_B64 $exec, $vcc, implicit-def $scc
+  ; GCN:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def $scc
   ; GCN:   S_CBRANCH_VCCNZ %bb.1, implicit $vcc
   ; GCN: bb.2:
   ; GCN:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
--- a/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll
@@ -35,7 +35,8 @@
 }
 
 ; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
-; GCN:   v_cmp_ne_u32_e64 [[CC1:[^,]+]],
+; GCN:   s_cmp_lg_u32
+; GCN: s_cselect_b64  [[CC1:[^,]+]], -1, 0
 ; GCN:   s_branch [[BB1:BB[0-9]+_[0-9]+]]
 ; GCN: [[BB0:BB[0-9]+_[0-9]+]]
 ; GCN-NOT: v_cndmask_b32
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -262,7 +262,11 @@
 }
 
 ; FUNC-LABEL: {{^}}s_or_i1:
-; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
+; SI: s_cmp_eq_u32
+; SI: s_cselect_b64 [[C1:[^,]+]], -1, 0
+; SI: s_cmp_eq_u32
+; SI: s_cselect_b64 [[C2:[^,]+]], -1, 0
+; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], [[C1]], [[C2]]
 define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %cmp0 = icmp eq i32 %a, %b
   %cmp1 = icmp eq i32 %c, %d
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -133,8 +133,8 @@
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
@@ -254,7 +254,7 @@
 
 ; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
 ; GCN: s_load_dword
-; GCN: s_bfe_u32
+; GCN-DAG: s_bfe_u32
 ; GCN-DAG: s_sub_i32
 ; GCN-DAG: s_and_b32
 ; GCN-DAG: s_sub_i32
@@ -273,8 +273,8 @@
 }
 
 ; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1:
-; GCN: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -111,14 +111,15 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s0, s4
-; SI-NEXT:    v_cmp_lt_i32_e64 s[10:11], s9, 0
-; SI-NEXT:    s_add_i32 s9, s8, s9
-; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_add_i32 s12, s8, s9
+; SI-NEXT:    s_cmp_lt_i32 s9, 0
+; SI-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; SI-NEXT:    s_cmp_lt_i32 s12, s8
 ; SI-NEXT:    s_mov_b32 s1, s5
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, s9, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s9
+; SI-NEXT:    v_mov_b32_e32 v0, s12
+; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], vcc
+; SI-NEXT:    s_xor_b64 s[0:1], s[10:11], s[8:9]
 ; SI-NEXT:    s_mov_b32 s4, s6
 ; SI-NEXT:    s_mov_b32 s5, s7
 ; SI-NEXT:    s_mov_b32 s6, s2
@@ -134,13 +135,14 @@
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_cmp_lt_i32_e64 s[2:3], s1, 0
-; VI-NEXT:    s_add_i32 s1, s0, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, s1, v4
-; VI-NEXT:    v_mov_b32_e32 v4, s1
+; VI-NEXT:    s_add_i32 s4, s0, s1
+; VI-NEXT:    s_cmp_lt_i32 s1, 0
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    s_cmp_lt_i32 s4, s0
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT:    v_mov_b32_e32 v4, s4
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
 ; VI-NEXT:    flat_store_dword v[0:1], v4
 ; VI-NEXT:    v_mov_b32_e32 v2, s6
 ; VI-NEXT:    v_mov_b32_e32 v3, s7
@@ -548,42 +550,42 @@
 ; GFX9-LABEL: v_saddo_v2i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[4:5]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_i32 v5, v0, v2 clamp
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add_i32 v2, v1, v3 clamp
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT:    v_add_u32_e32 v5, v1, v3
+; GFX9-NEXT:    v_add_i32 v1, v1, v3 clamp
+; GFX9-NEXT:    v_add_u32_e32 v4, v0, v2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v1
+; GFX9-NEXT:    v_add_i32 v0, v0, v2 clamp
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v5
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[4:5], s[0:1]
+; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: v_saddo_v2i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v5, s[4:5]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v5, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_i32 v5, v1, v3 clamp
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_add_nc_i32 v6, v0, v2 clamp
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v1, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX10-NEXT:    global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v1, v3
+; GFX10-NEXT:    v_add_nc_i32 v1, v1, v3 clamp
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v0, v2
+; GFX10-NEXT:    v_add_nc_i32 v0, v0, v2 clamp
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX10-NEXT:    global_store_dwordx2 v5, v[3:4], s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v5, v[0:1], s[2:3]
 ; GFX10-NEXT:    s_endpgm
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -466,76 +466,75 @@
 ;
 ; TONGA-LABEL: sdiv_v2i32:
 ; TONGA:       ; %bb.0:
-; TONGA-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
+; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; TONGA-NEXT:    s_mov_b32 s7, 0xf000
 ; TONGA-NEXT:    s_mov_b32 s6, -1
-; TONGA-NEXT:    s_mov_b32 s2, s6
-; TONGA-NEXT:    s_mov_b32 s3, s7
+; TONGA-NEXT:    s_mov_b32 s10, s6
+; TONGA-NEXT:    s_mov_b32 s11, s7
 ; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
-; TONGA-NEXT:    s_mov_b32 s0, s10
-; TONGA-NEXT:    s_mov_b32 s1, s11
-; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT:    s_mov_b32 s8, s2
+; TONGA-NEXT:    s_mov_b32 s9, s3
+; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_mov_b32 s2, 0x4f7ffffe
-; TONGA-NEXT:    s_mov_b32 s4, s8
-; TONGA-NEXT:    s_mov_b32 s5, s9
+; TONGA-NEXT:    s_mov_b32 s4, s0
+; TONGA-NEXT:    s_mov_b32 s5, s1
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v2
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
-; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v4
+; TONGA-NEXT:    v_ashrrev_i32_e32 v5, 31, v2
+; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v5, v2
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
+; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v5
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
+; TONGA-NEXT:    v_xor_b32_e32 v8, v4, v5
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v5, v2
-; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, 0, v2
-; TONGA-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
+; TONGA-NEXT:    v_xor_b32_e32 v9, v6, v7
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v7, v3
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v2
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v7
-; TONGA-NEXT:    v_xor_b32_e32 v4, v7, v4
+; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v3
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v7, v7
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
 ; TONGA-NEXT:    v_mul_f32_e32 v5, s2, v5
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; TONGA-NEXT:    v_mul_lo_u32 v6, v6, v5
-; TONGA-NEXT:    v_mul_hi_u32 v6, v5, v6
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
-; TONGA-NEXT:    v_mul_hi_u32 v5, v0, v5
-; TONGA-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
-; TONGA-NEXT:    v_mul_lo_u32 v8, v5, v2
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
-; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
+; TONGA-NEXT:    v_mul_f32_e32 v7, s2, v7
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v6, v1
+; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v5
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v4
+; TONGA-NEXT:    v_mul_lo_u32 v11, v11, v7
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
+; TONGA-NEXT:    v_mul_hi_u32 v4, v5, v10
+; TONGA-NEXT:    v_mul_hi_u32 v6, v7, v11
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v6, v7
+; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
+; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
+; TONGA-NEXT:    v_mul_lo_u32 v10, v5, v3
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v5
+; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
-; TONGA-NEXT:    v_subrev_u32_e32 v8, vcc, v2, v0
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v5
+; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
+; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
+; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
+; TONGA-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
+; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[2:3]
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
-; TONGA-NEXT:    s_mov_b64 s[0:1], vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v6, v3
-; TONGA-NEXT:    v_xor_b32_e32 v2, v0, v6
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v2
-; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v2
-; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v3
-; TONGA-NEXT:    v_xor_b32_e32 v6, v3, v6
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
-; TONGA-NEXT:    v_mul_f32_e32 v0, s2, v0
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v0
-; TONGA-NEXT:    v_mul_hi_u32 v7, v0, v9
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v7, v0
-; TONGA-NEXT:    v_mul_hi_u32 v3, v1, v0
-; TONGA-NEXT:    v_xor_b32_e32 v0, v5, v4
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v2
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v4, v1
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v1, v2
-; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v2, v1
-; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
-; TONGA-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
-; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v6
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v9
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v1, v9
 ; TONGA-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -931,137 +930,134 @@
 ;
 ; TONGA-LABEL: sdiv_v4i32:
 ; TONGA:       ; %bb.0:
-; TONGA-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x24
-; TONGA-NEXT:    s_mov_b32 s7, 0xf000
-; TONGA-NEXT:    s_mov_b32 s6, -1
-; TONGA-NEXT:    s_mov_b32 s2, s6
-; TONGA-NEXT:    s_mov_b32 s3, s7
+; TONGA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT:    s_mov_b32 s11, 0xf000
+; TONGA-NEXT:    s_mov_b32 s10, -1
+; TONGA-NEXT:    s_mov_b32 s6, s10
+; TONGA-NEXT:    s_mov_b32 s7, s11
 ; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
-; TONGA-NEXT:    s_mov_b32 s0, s10
-; TONGA-NEXT:    s_mov_b32 s1, s11
-; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
-; TONGA-NEXT:    s_mov_b32 s10, 0x4f7ffffe
-; TONGA-NEXT:    s_mov_b32 s4, s8
-; TONGA-NEXT:    s_mov_b32 s5, s9
-; TONGA-NEXT:    s_waitcnt vmcnt(0)
+; TONGA-NEXT:    s_mov_b32 s4, s2
+; TONGA-NEXT:    s_mov_b32 s5, s3
+; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; TONGA-NEXT:    s_mov_b32 s2, 0x4f7ffffe
+; TONGA-NEXT:    s_mov_b32 s8, s0
+; TONGA-NEXT:    s_mov_b32 s9, s1
+; TONGA-NEXT:    s_waitcnt vmcnt(1)
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
+; TONGA-NEXT:    s_waitcnt vmcnt(0)
+; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v5
+; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v11, v5
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v11
+; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
+; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v5
 ; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v8, v0
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v8
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v4, v0
-; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v2
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; TONGA-NEXT:    v_mul_f32_e32 v4, s10, v4
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v0
-; TONGA-NEXT:    v_mul_lo_u32 v10, v4, v9
-; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0
-; TONGA-NEXT:    v_mul_hi_u32 v10, v9, v10
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v10, v9
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v4
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v9, v9
+; TONGA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
+; TONGA-NEXT:    v_add_u32_e32 v6, vcc, v13, v6
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
-; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v11, 31, v4
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v11, v4
-; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v11
-; TONGA-NEXT:    v_mul_hi_u32 v9, v4, v9
-; TONGA-NEXT:    v_xor_b32_e32 v8, v11, v8
-; TONGA-NEXT:    v_mul_lo_u32 v12, v9, v0
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, 1, v9
-; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v0
-; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v4, v0
-; TONGA-NEXT:    v_cndmask_b32_e64 v9, v9, v13, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v9
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v0
-; TONGA-NEXT:    s_mov_b64 s[0:1], vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v10, v1
-; TONGA-NEXT:    v_xor_b32_e32 v1, v0, v10
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v1
-; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, 0, v1
-; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v5
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v4, v5
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v4
-; TONGA-NEXT:    v_cndmask_b32_e64 v9, v9, v12, s[0:1]
-; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v10
-; TONGA-NEXT:    v_mul_f32_e32 v0, s10, v0
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
-; TONGA-NEXT:    v_mul_lo_u32 v13, v13, v0
-; TONGA-NEXT:    v_mul_hi_u32 v11, v0, v13
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v11, v0
-; TONGA-NEXT:    v_mul_hi_u32 v11, v5, v0
-; TONGA-NEXT:    v_xor_b32_e32 v0, v9, v8
-; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
-; TONGA-NEXT:    v_mul_lo_u32 v8, v11, v1
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v11
-; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v5, v8
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v5, v1
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v11, v9, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v5, v1
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v8
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
-; TONGA-NEXT:    s_mov_b64 s[0:1], vcc
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v14, v2
-; TONGA-NEXT:    v_xor_b32_e32 v2, v1, v14
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, v2
-; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v2
-; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; TONGA-NEXT:    v_mul_f32_e32 v9, s2, v9
+; TONGA-NEXT:    v_xor_b32_e32 v6, v6, v13
+; TONGA-NEXT:    v_xor_b32_e32 v16, v10, v11
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v6
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v2
+; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v12, v2
+; TONGA-NEXT:    v_mul_f32_e32 v8, s2, v8
+; TONGA-NEXT:    v_xor_b32_e32 v17, v12, v13
+; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v12
+; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v5
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v11, v11
+; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v9
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v10, v1
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v10
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, 0, v4
+; TONGA-NEXT:    v_mul_lo_u32 v10, v10, v8
+; TONGA-NEXT:    v_mul_hi_u32 v12, v9, v12
+; TONGA-NEXT:    v_mul_f32_e32 v11, s2, v11
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; TONGA-NEXT:    v_mul_hi_u32 v10, v8, v10
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v12, v9
+; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, 0, v6
+; TONGA-NEXT:    v_mul_lo_u32 v12, v12, v11
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
+; TONGA-NEXT:    v_mul_hi_u32 v8, v0, v8
+; TONGA-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; TONGA-NEXT:    v_mul_hi_u32 v12, v11, v12
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v14, v7
+; TONGA-NEXT:    v_xor_b32_e32 v7, v7, v14
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v7
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v12, v11
+; TONGA-NEXT:    v_mul_lo_u32 v12, v8, v4
+; TONGA-NEXT:    v_mul_hi_u32 v9, v1, v9
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; TONGA-NEXT:    v_mul_hi_u32 v11, v2, v11
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v12
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v0, v4
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v12, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
+; TONGA-NEXT:    v_mul_lo_u32 v0, v9, v5
+; TONGA-NEXT:    v_mul_f32_e32 v10, s2, v10
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v4, v10
+; TONGA-NEXT:    v_mul_lo_u32 v10, v11, v6
+; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, 1, v9
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v0, v5
+; TONGA-NEXT:    v_sub_u32_e32 v2, vcc, v2, v10
+; TONGA-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[2:3]
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v0, v5
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, 1, v11
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, 1, v8
+; TONGA-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
+; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, v2, v6
+; TONGA-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[2:3]
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, 1, v1
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; TONGA-NEXT:    v_cndmask_b32_e32 v0, v1, v9, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v8, v8, v12, s[0:1]
+; TONGA-NEXT:    v_xor_b32_e32 v1, v8, v15
+; TONGA-NEXT:    v_xor_b32_e32 v5, v0, v16
+; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v15, v1
+; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v16, v5
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v7
+; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; TONGA-NEXT:    v_mul_f32_e32 v1, s10, v1
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v1
-; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v5, v1
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v10, v6
-; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v10
-; TONGA-NEXT:    v_mul_hi_u32 v6, v5, v1
-; TONGA-NEXT:    v_xor_b32_e32 v1, v8, v4
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v4, v1
-; TONGA-NEXT:    v_xor_b32_e32 v10, v10, v14
-; TONGA-NEXT:    v_mul_lo_u32 v4, v6, v2
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v6
-; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, v5, v4
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v2
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v6, v8, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v4, v2
-; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v2
-; TONGA-NEXT:    s_mov_b64 s[0:1], vcc
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v9, v3
-; TONGA-NEXT:    v_xor_b32_e32 v3, v2, v9
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v2, v3
-; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, 0, v3
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; TONGA-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v4, v7
-; TONGA-NEXT:    v_xor_b32_e32 v9, v4, v9
-; TONGA-NEXT:    v_xor_b32_e32 v4, v7, v4
-; TONGA-NEXT:    v_mul_f32_e32 v2, s10, v2
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; TONGA-NEXT:    v_mul_lo_u32 v8, v8, v2
-; TONGA-NEXT:    v_mul_hi_u32 v6, v2, v8
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v6, v2
-; TONGA-NEXT:    v_mul_hi_u32 v6, v4, v2
-; TONGA-NEXT:    v_xor_b32_e32 v2, v5, v10
-; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v10, v2
-; TONGA-NEXT:    v_mul_lo_u32 v5, v6, v3
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, 1, v6
-; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, v4, v5
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v3
-; TONGA-NEXT:    v_cndmask_b32_e64 v5, v6, v7, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v4, v3
-; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v3
-; TONGA-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
 ; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v9
-; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v9, v3
-; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT:    v_mul_hi_u32 v5, v4, v5
+; TONGA-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s[4:5]
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, 1, v10
+; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v6
+; TONGA-NEXT:    v_cndmask_b32_e32 v2, v10, v8, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v17
+; TONGA-NEXT:    v_mul_lo_u32 v5, v4, v7
+; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v17, v2
+; TONGA-NEXT:    v_xor_b32_e32 v6, v9, v14
+; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v5
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v7
+; TONGA-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, v3, v7
+; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v7
+; TONGA-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v6
+; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sdiv_v4i32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -155,27 +155,29 @@
 ; GCN-IR-NEXT:    s_subb_u32 s11, s7, s2
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[8:9], s[0:1]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s0, s8
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s6
 ; GCN-IR-NEXT:    s_subb_u32 s7, s1, s8
-; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s7
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s10
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
-; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s11
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[6:7], 0
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[12:13], s[0:1]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s10
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s11
+; GCN-IR-NEXT:    s_cmp_eq_u32 s11, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[14:15], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[12:13], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[12:13], s[0:1], -1
@@ -1007,51 +1009,55 @@
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
-; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_sext_i32_i16 s7, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[2:3], 24
+; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[2:3], 24
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 24
-; GCN-IR-NEXT:    s_ashr_i32 s6, s7, 31
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; GCN-IR-NEXT:    s_sub_u32 s10, s0, s2
-; GCN-IR-NEXT:    s_mov_b32 s7, s6
-; GCN-IR-NEXT:    s_subb_u32 s11, s1, s2
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[8:9]
-; GCN-IR-NEXT:    s_sub_u32 s8, s0, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, s1, s6
-; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s9
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-IR-NEXT:    s_flbit_i32_b32 s14, s10
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
-; GCN-IR-NEXT:    s_add_i32 s14, s14, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s15, s11
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 24
+; GCN-IR-NEXT:    s_ashr_i32 s4, s5, 31
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[2:3], s[6:7]
+; GCN-IR-NEXT:    s_sub_u32 s10, s6, s2
+; GCN-IR-NEXT:    s_mov_b32 s5, s4
+; GCN-IR-NEXT:    s_subb_u32 s11, s7, s2
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[4:5], s[8:9]
+; GCN-IR-NEXT:    s_sub_u32 s6, s6, s4
+; GCN-IR-NEXT:    s_subb_u32 s7, s7, s4
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[12:13]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s10
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s11
+; GCN-IR-NEXT:    s_cmp_eq_u32 s11, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s15
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s14
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[14:15], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[8:9], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[12:13], s[10:11], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[12:13]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[12:13], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[12:13], s[0:1], -1
 ; GCN-IR-NEXT:    s_and_b64 s[12:13], s[12:13], vcc
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[12:13]
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
@@ -1064,10 +1070,10 @@
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
 ; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[10:11], v4
-; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
+; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
+; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
@@ -1082,9 +1088,9 @@
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
@@ -1112,16 +1118,16 @@
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GCN-IR-NEXT:  BB9_7: ; %udiv-end
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[6:7], s[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[4:5], s[2:3]
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
+; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[8:11], 0 offset:4
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
@@ -1262,13 +1268,14 @@
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
 ; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[2:3], s[6:7]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s0, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s1, s2
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_subb_u32 s7, s1, s2
 ; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
 ; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[8:9], 0, -1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
--- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll
@@ -11,12 +11,13 @@
 ; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s0, 1, s2
 ; GCN-NEXT:    s_ff1_i32_b32 s0, s0
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_cmp_lg_u32 s2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], s2, 0
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -1, s[2:3]
 ; GCN-NEXT:    v_ffbh_i32_e32 v1, v0
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -166,7 +166,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, |[[X]]|, [[VCC]]
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
 define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
@@ -185,7 +186,8 @@
 ; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000
 
-; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], [[K]], |[[X]]|, [[VCC]]
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
 define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
@@ -221,7 +223,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64 [[VCC:.*]], -1, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
 define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
@@ -407,7 +410,7 @@
 ; GCN-LABEL: {{^}}add_select_negk_negk_f32:
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 
-; GCN: v_cmp_eq_u32_e64
+; GCN: s_cmp_eq_u32
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
 define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
@@ -424,7 +427,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0xc5800000
 ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]]
 
-; GCN: v_cmp_eq_u32_e64
+; GCN: s_cmp_eq_u32
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
 define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
@@ -455,7 +458,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64 vcc, -1, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
@@ -490,7 +494,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64 vcc, -1, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
 define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
@@ -632,7 +637,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN-DAG: v_cmp_eq_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64  [[VCC:.*]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
 define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
@@ -651,7 +657,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0
+; GCN-DAG: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64  [[VCC:.*]], -1, 0
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -4.0, |[[X]]|, [[VCC]]
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
 define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
@@ -688,7 +695,8 @@
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
 
-; GCN: v_cmp_ne_u32_e64 vcc
+; GCN: s_cmp_lg_u32
+; GCN: s_cselect_b64 vcc, -1, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
 define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll
--- a/llvm/test/CodeGen/AMDGPU/select-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll
@@ -5,9 +5,11 @@
 ; scalar compares, we don't want to use multiple condition registers.
 
 ; GCN-LABEL: {{^}}opt_select_i32_and_cmp_i32:
-; GCN-DAG: v_cmp_ne_u32_e32 vcc,
-; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
@@ -37,9 +39,11 @@
 }
 
 ; GCN-LABEL: {{^}}opt_select_i64_and_cmp_i32:
-; GCN-DAG: v_cmp_ne_u32_e32 vcc,
-; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_and_b64 vcc, vcc, [[CMP1]]
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN: s_and_b64 vcc, [[CMP1]], [[CMP2]]
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
@@ -69,9 +73,11 @@
 }
 
 ; GCN-LABEL: {{^}}opt_select_i32_or_cmp_i32:
-; GCN-DAG: v_cmp_ne_u32_e32 vcc,
-; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
@@ -102,9 +108,11 @@
 }
 
 ; GCN-LABEL: {{^}}opt_select_i64_or_cmp_i32:
-; GCN-DAG: v_cmp_ne_u32_e32 vcc,
-; GCN-DAG: v_cmp_ne_u32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]]
-; GCN: s_or_b64 vcc, vcc, [[CMP1]]
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP1:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN-DAG: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CMP2:s\[[0-9]+:[0-9]+\]]], -1, 0
+; GCN: s_or_b64 vcc, [[CMP1]], [[CMP2]]
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -183,11 +183,12 @@
 
 ; GCN-LABEL: {{^}}v_select_v4i32:
 ; GCN: buffer_load_dwordx4
-; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
+; GCN: s_cselect_b64 vcc, -1, 0
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
 bb:
@@ -221,7 +222,7 @@
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
-; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
@@ -235,7 +236,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_select_v3f32:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
@@ -252,7 +253,7 @@
 ; GCN-LABEL: {{^}}s_select_v4f32:
 ; GCN: s_load_dwordx4
 ; GCN: s_load_dwordx4
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
@@ -269,11 +270,12 @@
 
 ; GCN-LABEL: {{^}}v_select_v4f32:
 ; GCN: buffer_load_dwordx4
-; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 32
+; GCN: s_cselect_b64 vcc, -1, 0
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
 bb:
@@ -285,7 +287,7 @@
 }
 
 ; GCN-LABEL: {{^}}s_select_v5f32:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}}
 
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
--- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -68,7 +68,7 @@
 }
 
 ; FUNC-LABEL: {{^}}selectcc_bool:
-; SI: v_cmp_ne_u32
+; SI: s_cmp_lg_u32
 ; SI: v_cndmask_b32_e64
 ; SI-NOT: cmp
 ; SI-NOT: cndmask
diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll
--- a/llvm/test/CodeGen/AMDGPU/selectcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}selectcc_i64:
 ; EG: XOR_INT
@@ -9,8 +9,9 @@
 ; EG: CNDE_INT
 ; EG: CNDE_INT
 ; SI: v_cmp_eq_u64
-; SI: v_cndmask
-; SI: v_cndmask
+; VI: s_cmp_eq_u64
+; GCN: v_cndmask
+; GCN: v_cndmask
 define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
--- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -4,8 +4,9 @@
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT:buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 
@@ -21,8 +22,9 @@
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_0:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 
@@ -38,8 +40,9 @@
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_neg1:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_eq_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -52,8 +55,9 @@
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_neg1:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_eq_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -66,8 +70,9 @@
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_0:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -80,8 +85,9 @@
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_0:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_ne_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_lg_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -94,8 +100,9 @@
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_eq_1:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_eq_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -108,8 +115,9 @@
 
 ; FUNC-LABEL: {{^}}zext_bool_icmp_ne_1:
 ; GCN-NOT: v_cmp
-; GCN: v_cmp_eq_u32_e32 vcc,
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_eq_u32
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
@@ -149,14 +157,16 @@
 ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
 ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff
-; GCN-DAG: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
 ; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]]
-; SI: v_cmp_ne_u32_e32 vcc, [[B]], [[VK255]]
+; SI: s_cmp_lg_u32 [[B]], [[K255]]
+; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0
 
-; VI-DAG: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
+; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]]
+; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]]
 ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]]
 
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
+; VI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
@@ -200,9 +210,9 @@
 ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff
 ; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]]
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
-; GCN: v_cmp_ne_u32_e32 vcc, [[B]], [[VK]]{{$}}
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}}
+; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll
--- a/llvm/test/CodeGen/AMDGPU/setcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc.ll
@@ -7,8 +7,8 @@
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
 
-; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e32
+; GCN: s_cmp_eq_u32
+; GCN: s_cmp_eq_u32
 define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %result = icmp eq <2 x i32> %a, %b
   %sext = sext <2 x i1> %result to <2 x i32>
@@ -22,10 +22,10 @@
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e32
-; GCN: v_cmp_eq_u32_e32
+; GCN: s_cmp_eq_u32
+; GCN: s_cmp_eq_u32
+; GCN: s_cmp_eq_u32
+; GCN: s_cmp_eq_u32
 define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -227,7 +227,7 @@
 
 ; FUNC-LABEL: {{^}}i32_eq:
 ; R600: SETE_INT
-; GCN: v_cmp_eq_u32
+; GCN: s_cmp_eq_u32
 define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp eq i32 %a, %b
@@ -238,7 +238,7 @@
 
 ; FUNC-LABEL: {{^}}i32_ne:
 ; R600: SETNE_INT
-; GCN: v_cmp_ne_u32
+; GCN: s_cmp_lg_u32
 define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ne i32 %a, %b
@@ -249,7 +249,7 @@
 
 ; FUNC-LABEL: {{^}}i32_ugt:
 ; R600: SETGT_UINT
-; GCN: v_cmp_gt_u32
+; GCN: s_cmp_gt_u32
 define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ugt i32 %a, %b
@@ -260,7 +260,7 @@
 
 ; FUNC-LABEL: {{^}}i32_uge:
 ; R600: SETGE_UINT
-; GCN: v_cmp_ge_u32
+; GCN: s_cmp_ge_u32
 define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp uge i32 %a, %b
@@ -271,7 +271,7 @@
 
 ; FUNC-LABEL: {{^}}i32_ult:
 ; R600: SETGT_UINT
-; GCN: v_cmp_lt_u32
+; GCN: s_cmp_lt_u32
 define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ult i32 %a, %b
@@ -282,7 +282,7 @@
 
 ; FUNC-LABEL: {{^}}i32_ule:
 ; R600: SETGE_UINT
-; GCN: v_cmp_le_u32
+; GCN: s_cmp_le_u32
 define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ule i32 %a, %b
@@ -293,7 +293,7 @@
 
 ; FUNC-LABEL: {{^}}i32_sgt:
 ; R600: SETGT_INT
-; GCN: v_cmp_gt_i32
+; GCN: s_cmp_gt_i32
 define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sgt i32 %a, %b
@@ -304,7 +304,7 @@
 
 ; FUNC-LABEL: {{^}}i32_sge:
 ; R600: SETGE_INT
-; GCN: v_cmp_ge_i32
+; GCN: s_cmp_ge_i32
 define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sge i32 %a, %b
@@ -315,7 +315,7 @@
 
 ; FUNC-LABEL: {{^}}i32_slt:
 ; R600: SETGT_INT
-; GCN: v_cmp_lt_i32
+; GCN: s_cmp_lt_i32
 define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp slt i32 %a, %b
@@ -326,7 +326,7 @@
 
 ; FUNC-LABEL: {{^}}i32_sle:
 ; R600: SETGE_INT
-; GCN: v_cmp_le_i32
+; GCN: s_cmp_le_i32
 define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sle i32 %a, %b
@@ -413,8 +413,8 @@
 }
 
 ; FUNC-LABEL: setcc_v2i32_expand
-; GCN: v_cmp_gt_i32
-; GCN: v_cmp_gt_i32
+; GCN: s_cmp_gt_i32
+; GCN: s_cmp_gt_i32
 define amdgpu_kernel void @setcc_v2i32_expand(
   <2 x i32> addrspace(1)* %a,
   <2 x i32> addrspace(1)* %b,
@@ -438,10 +438,10 @@
 }
 
 ; FUNC-LABEL: setcc_v4i32_expand
-; GCN: v_cmp_gt_i32
-; GCN: v_cmp_gt_i32
-; GCN: v_cmp_gt_i32
-; GCN: v_cmp_gt_i32
+; GCN: s_cmp_gt_i32
+; GCN: s_cmp_gt_i32
+; GCN: s_cmp_gt_i32
+; GCN: s_cmp_gt_i32
 define amdgpu_kernel void @setcc_v4i32_expand(
   <4 x i32> addrspace(1)* %a,
   <4 x i32> addrspace(1)* %b,
diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
 
 ; XXX: Merge this into setcc, once R600 supports 64-bit operations
 
@@ -159,7 +159,8 @@
 ;;;==========================================================================;;;
 
 ; GCN-LABEL: {{^}}i64_eq:
-; GCN: v_cmp_eq_u64
+; SI: v_cmp_eq_u64
+; VI: s_cmp_eq_u64
 define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp eq i64 %a, %b
@@ -169,7 +170,8 @@
 }
 
 ; GCN-LABEL: {{^}}i64_ne:
-; GCN: v_cmp_ne_u64
+; SI: v_cmp_ne_u64
+; VI: s_cmp_lg_u64
 define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ne i64 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -191,14 +191,16 @@
 ; GCN-NEXT:    s_sub_i32 s2, s8, 64
 ; GCN-NEXT:    s_lshl_b64 s[0:1], s[6:7], s8
 ; GCN-NEXT:    s_lshr_b64 s[10:11], s[4:5], s9
-; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
 ; GCN-NEXT:    s_lshl_b64 s[2:3], s[4:5], s2
+; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
+; GCN-NEXT:    s_cmp_lt_u32 s8, 64
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s8, 64
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s10
@@ -230,12 +232,14 @@
 ; GCN-NEXT:    s_lshl_b64 s[10:11], s[6:7], s9
 ; GCN-NEXT:    s_or_b64 s[10:11], s[0:1], s[10:11]
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s2
+; GCN-NEXT:    s_cmp_lt_u32 s8, 64
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s8, 64
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s10
@@ -259,25 +263,27 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i64 s[0:1], s[6:7], s8
 ; GCN-NEXT:    s_ashr_i32 s2, s7, 31
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    s_sub_i32 s0, s8, 64
+; GCN-NEXT:    s_ashr_i64 s[0:1], s[6:7], s8
+; GCN-NEXT:    s_cmp_lt_u32 s8, 64
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
+; GCN-NEXT:    s_sub_i32 s0, s8, 64
 ; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], s0
 ; GCN-NEXT:    s_sub_i32 s0, 64, s8
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    s_lshl_b64 s[0:1], s[6:7], s0
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s8
-; GCN-NEXT:    v_cmp_lt_u32_e64 vcc, s8, 64
 ; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-NEXT:    s_cmp_eq_u32 s8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v0, v1, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v4, s6
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -76,7 +76,8 @@
 ; SI-NEXT:  ; %bb.1: ; %else
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
+; SI-NEXT:    s_cmp_eq_u32 s0, 0
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    s_and_b64 s[4:5], s[0:1], exec
 ; SI-NEXT:  BB1_2: ; %endif
 ; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
@@ -100,7 +101,8 @@
 ; FLAT-NEXT:  ; %bb.1: ; %else
 ; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_cmp_eq_u32_e64 s[0:1], s0, 0
+; FLAT-NEXT:    s_cmp_eq_u32 s0, 0
+; FLAT-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; FLAT-NEXT:    s_and_b64 s[4:5], s[0:1], exec
 ; FLAT-NEXT:  BB1_2: ; %endif
 ; FLAT-NEXT:    s_or_b64 exec, exec, s[6:7]
@@ -169,11 +171,14 @@
 ; SI-NEXT:    s_load_dword s8, s[0:1], 0xc
 ; SI-NEXT:    s_brev_b32 s9, 44
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], s2, 1
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], s3, 4
-; SI-NEXT:    v_cmp_gt_i32_e64 s[2:3], s3, 3
-; SI-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; SI-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
+; SI-NEXT:    s_cmp_lt_i32 s2, 1
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_cmp_lt_i32 s3, 4
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s3, 3
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; SI-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; SI-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s9
@@ -242,11 +247,14 @@
 ; FLAT-NEXT:    s_load_dword s8, s[0:1], 0x30
 ; FLAT-NEXT:    s_brev_b32 s9, 44
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_cmp_lt_i32_e64 s[0:1], s2, 1
-; FLAT-NEXT:    v_cmp_lt_i32_e64 s[4:5], s3, 4
-; FLAT-NEXT:    v_cmp_gt_i32_e64 s[2:3], s3, 3
-; FLAT-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
+; FLAT-NEXT:    s_cmp_lt_i32 s2, 1
+; FLAT-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; FLAT-NEXT:    s_cmp_lt_i32 s3, 4
+; FLAT-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; FLAT-NEXT:    s_cmp_gt_i32 s3, 3
+; FLAT-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; FLAT-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
 ; FLAT-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
 ; FLAT-NEXT:    s_waitcnt vmcnt(0)
 ; FLAT-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s9
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -6,8 +6,10 @@
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[2:3], s0, 0
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s1, 0
+; CHECK-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; CHECK-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccnz BB0_3
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -10,9 +10,9 @@
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT:    s_cmp_eq_u32 s0, s1
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -23,9 +23,9 @@
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT:    s_cmp_eq_u32 s0, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
@@ -82,9 +82,9 @@
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT:    s_cmp_eq_u32 s0, s1
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v1, v0
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -96,9 +96,9 @@
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT:    s_cmp_eq_u32 s0, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, v0
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -220,9 +220,9 @@
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; SI-NEXT:    s_cmp_eq_u32 s0, s1
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -233,9 +233,9 @@
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; VI-NEXT:    s_cmp_eq_u32 s0, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %cmp = icmp eq i32 %a, %b
@@ -256,11 +256,11 @@
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s1
-; SI-NEXT:    v_mov_b32_e32 v1, s3
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
-; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    s_cmp_eq_u32 s0, s1
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s2, s3
+; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -272,11 +272,11 @@
 ; VI-NEXT:    s_mov_b32 s7, 0xf000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s1
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
-; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v1
-; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT:    s_cmp_eq_u32 s0, s1
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s2, s3
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -298,8 +298,8 @@
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
-; SI-NEXT:    v_mov_b32_e32 v0, s0
-; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
+; SI-NEXT:    s_cmp_eq_u32 s3, s0
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -314,8 +314,8 @@
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v0
+; VI-NEXT:    s_cmp_eq_u32 s3, s0
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -22,8 +22,9 @@
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; VI: s_endpgm
 
-; SI-DAG: v_cmp_eq_u32_e64 vcc,
-; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: s_cmp_eq_u32
+; SI-DAG: s_cselect_b64 vcc, -1, 0
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}, vcc
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -77,7 +77,8 @@
 }
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i1_f32:
-; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: s_cmp_eq_u32
+; SI: s_cselect_b64 [[CMP:s\[[0-9]+:[0-9]\]]], -1, 0
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -493,7 +493,8 @@
 define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
 ; SI-LABEL: test_kill_control_flow_return:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    v_cmp_eq_u32_e64 s[4:5], s0, 1
+; SI-NEXT:    s_cmp_eq_u32 s0, 1
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; SI-NEXT:    s_mov_b64 s[2:3], exec
 ; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], exec
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
@@ -529,8 +530,9 @@
 ;
 ; GFX10-WAVE64-LABEL: test_kill_control_flow_return:
 ; GFX10-WAVE64:       ; %bb.0: ; %entry
-; GFX10-WAVE64-NEXT:    v_cmp_eq_u32_e64 s[4:5], s0, 1
+; GFX10-WAVE64-NEXT:    s_cmp_eq_u32 s0, 1
 ; GFX10-WAVE64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; GFX10-WAVE64-NEXT:    s_xor_b64 s[4:5], s[4:5], exec
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB9_4
@@ -565,8 +567,9 @@
 ;
 ; GFX10-WAVE32-LABEL: test_kill_control_flow_return:
 ; GFX10-WAVE32:       ; %bb.0: ; %entry
-; GFX10-WAVE32-NEXT:    v_cmp_eq_u32_e64 s2, s0, 1
+; GFX10-WAVE32-NEXT:    s_cmp_eq_u32 s0, 1
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-WAVE32-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX10-WAVE32-NEXT:    s_xor_b32 s2, s2, exec_lo
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 s1, s1, s2
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 BB9_4
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -375,8 +375,8 @@
   ; CHECK:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
   ; CHECK:   [[V_OR_B32_e32_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e32 [[S_ADD_I32_24]], [[V_OR_B32_e32_66]], implicit $exec
   ; CHECK:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e32_67]], implicit $exec
-  ; CHECK:   undef %691.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
-  ; CHECK:   IMAGE_STORE_V4_V2_gfx10 %691, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource")
+  ; CHECK:   undef %692.sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
+  ; CHECK:   IMAGE_STORE_V4_V2_gfx10 %692, undef %578:vreg_64, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into custom "ImageResource")
   ; CHECK:   S_ENDPGM 0
 .expVert:
   %0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -130,25 +130,27 @@
 ; GCN-IR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
@@ -1036,27 +1038,29 @@
 ; GCN-IR-NEXT:    s_sub_u32 s8, s6, s2
 ; GCN-IR-NEXT:    s_subb_u32 s9, s7, s2
 ; GCN-IR-NEXT:    s_sub_u32 s10, s10, s0
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s10
 ; GCN-IR-NEXT:    s_subb_u32 s11, s11, s0
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s11
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s8
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s11, 0
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s9
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s10
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s11, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s11
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s9
+; GCN-IR-NEXT:    s_cmp_eq_u32 s9, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s6
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[12:13], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[6:7], s[8:9], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[6:7], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
@@ -1187,67 +1191,71 @@
 ;
 ; GCN-IR-LABEL: s_test_srem24_48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
-; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s4, s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_sext_i32_i16 s3, s3
-; GCN-IR-NEXT:    s_sext_i32_i16 s7, s0
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[2:3], 24
+; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[2:3], 24
+; GCN-IR-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s3, 31
-; GCN-IR-NEXT:    s_ashr_i32 s10, s7, 31
+; GCN-IR-NEXT:    s_ashr_i32 s10, s5, 31
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 24
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[6:7], 24
 ; GCN-IR-NEXT:    s_mov_b32 s11, s10
-; GCN-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[8:9], s[10:11]
-; GCN-IR-NEXT:    s_sub_u32 s6, s0, s2
-; GCN-IR-NEXT:    s_subb_u32 s7, s1, s2
-; GCN-IR-NEXT:    s_sub_u32 s8, s8, s10
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s8
-; GCN-IR-NEXT:    s_subb_u32 s9, s9, s10
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s9
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-IR-NEXT:    s_flbit_i32_b32 s12, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s9, 0
-; GCN-IR-NEXT:    s_add_i32 s12, s12, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s13, s7
+; GCN-IR-NEXT:    s_xor_b64 s[4:5], s[6:7], s[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[6:7], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_sub_u32 s4, s4, s2
+; GCN-IR-NEXT:    s_subb_u32 s5, s5, s2
+; GCN-IR-NEXT:    s_sub_u32 s6, s6, s10
+; GCN-IR-NEXT:    s_subb_u32 s7, s7, s10
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s4
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s5
+; GCN-IR-NEXT:    s_cmp_eq_u32 s5, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s13
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s12
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[12:13], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[8:9], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[10:11]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[10:11], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[10:11], s[0:1], -1
 ; GCN-IR-NEXT:    s_and_b64 s[10:11], s[10:11], vcc
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[10:11]
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], v[4:5], v[0:1]
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 63, v0
-; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[6:7], v0
+; GCN-IR-NEXT:    v_lshl_b64 v[0:1], s[4:5], v0
 ; GCN-IR-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_5
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_not_b32_e32 v2, v2
-; GCN-IR-NEXT:    s_add_u32 s10, s8, -1
-; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[6:7], v4
+; GCN-IR-NEXT:    s_add_u32 s10, s6, -1
+; GCN-IR-NEXT:    v_lshr_b64 v[6:7], s[4:5], v4
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, v2, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-IR-NEXT:    s_addc_u32 s11, s9, -1
+; GCN-IR-NEXT:    s_addc_u32 s11, s7, -1
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, 0
@@ -1262,9 +1270,9 @@
 ; GCN-IR-NEXT:    v_sub_i32_e32 v8, vcc, s10, v6
 ; GCN-IR-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v10, s8, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v10, s6, v8
 ; GCN-IR-NEXT:    v_and_b32_e32 v2, 1, v8
-; GCN-IR-NEXT:    v_and_b32_e32 v11, s9, v8
+; GCN-IR-NEXT:    v_and_b32_e32 v11, s7, v8
 ; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v9, v1
 ; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, 0, v5, vcc
@@ -1279,9 +1287,9 @@
 ; GCN-IR-NEXT:    s_cbranch_vccz BB9_3
 ; GCN-IR-NEXT:    s_branch BB9_6
 ; GCN-IR-NEXT:  BB9_4:
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s5
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v1, v0, 0, s[0:1]
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s6
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[0:1]
 ; GCN-IR-NEXT:    s_branch BB9_7
 ; GCN-IR-NEXT:  BB9_5:
@@ -1292,24 +1300,24 @@
 ; GCN-IR-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GCN-IR-NEXT:    v_or_b32_e32 v1, v3, v1
 ; GCN-IR-NEXT:  BB9_7: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s6, v1
+; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
+; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v2, s7
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-IR-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    v_xor_b32_e32 v0, s2, v0
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IR-NEXT:    s_mov_b32 s10, -1
+; GCN-IR-NEXT:    buffer_store_short v1, off, s[8:11], 0 offset:4
+; GCN-IR-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
@@ -1444,13 +1452,14 @@
 ; GCN-IR-NEXT:    s_mov_b32 s1, s0
 ; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[6:7], s[0:1]
 ; GCN-IR-NEXT:    s_sub_u32 s2, s2, s0
-; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s6, s2
+; GCN-IR-NEXT:    s_subb_u32 s3, s3, s0
 ; GCN-IR-NEXT:    s_add_i32 s6, s6, 32
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s7, s3
+; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s7
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s6
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
 ; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[6:7], 0, -1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -131,25 +131,27 @@
 ; GCN-IR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
@@ -703,39 +705,40 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xd
 ; GCN-NEXT:    s_load_dword s3, s[0:1], 0xe
-; GCN-NEXT:    s_mov_b32 s5, 0xff000000
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, s4
+; GCN-NEXT:    s_mov_b32 s7, 0xff000000
+; GCN-NEXT:    s_mov_b32 s6, 0xffff
+; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v2, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s2, s2, s5
-; GCN-NEXT:    s_and_b32 s3, s3, s4
+; GCN-NEXT:    s_and_b32 s2, s2, s7
+; GCN-NEXT:    s_and_b32 s3, s3, s6
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 24
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, v0
-; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
-; GCN-NEXT:    s_load_dword s7, s[0:1], 0xc
-; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s8, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xc
 ; GCN-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v2
 ; GCN-NEXT:    v_rcp_f32_e32 v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s7, s7, s4
-; GCN-NEXT:    s_and_b32 s6, s6, s5
-; GCN-NEXT:    s_sub_u32 s8, 0, s2
+; GCN-NEXT:    s_and_b32 s6, s0, s6
+; GCN-NEXT:    s_and_b32 s8, s8, s7
+; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 24
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_subb_u32 s9, 0, s3
-; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_mul_lo_u32 v3, s8, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s8, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, s9, v1
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_mul_lo_u32 v4, s8, v1
+; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    s_sub_u32 s2, 0, s0
+; GCN-NEXT:    s_subb_u32 s3, 0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
+; GCN-NEXT:    v_mul_lo_u32 v5, s3, v1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v3
@@ -750,14 +753,14 @@
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v1, s[2:3], v1, v3
+; GCN-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[2:3]
-; GCN-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s8, v1
-; GCN-NEXT:    v_mul_lo_u32 v7, s9, v1
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v6, s2, v1
+; GCN-NEXT:    v_mul_lo_u32 v7, s3, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, s8, v1
+; GCN-NEXT:    v_mul_lo_u32 v6, s2, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GCN-NEXT:    v_mul_lo_u32 v11, v1, v5
 ; GCN-NEXT:    v_mul_hi_u32 v13, v1, v5
@@ -774,10 +777,10 @@
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[2:3]
+; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_mov_b32_e32 v3, s6
-; GCN-NEXT:    v_alignbit_b32 v3, s7, v3, 24
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
+; GCN-NEXT:    v_alignbit_b32 v3, s6, v3, 24
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mul_hi_u32 v5, v3, v1
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
@@ -791,80 +794,82 @@
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, 0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
-; GCN-NEXT:    v_mul_hi_u32 v5, v0, v1
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v1
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_subb_u32_e32 v4, vcc, 0, v4, vcc
-; GCN-NEXT:    v_sub_i32_e32 v5, vcc, v3, v0
-; GCN-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, 2, v1
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v2, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v7, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, 2, v1
+; GCN-NEXT:    v_mul_lo_u32 v10, v0, v1
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
+; GCN-NEXT:    v_subb_u32_e32 v6, vcc, 0, v6, vcc
+; GCN-NEXT:    v_sub_i32_e32 v7, vcc, v3, v0
+; GCN-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v6, vcc
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v0
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GCN-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, -1, v0, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v6, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v5, s[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v7, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v9, v5, vcc
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv24_i48:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; GCN-IR-NEXT:    s_load_dword s3, s[0:1], 0xc
 ; GCN-IR-NEXT:    s_load_dword s6, s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dword s7, s[0:1], 0xe
-; GCN-IR-NEXT:    s_mov_b32 s8, 0xffff
-; GCN-IR-NEXT:    s_mov_b32 s9, 0xff000000
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
+; GCN-IR-NEXT:    s_mov_b32 s4, 0xffff
+; GCN-IR-NEXT:    s_mov_b32 s7, 0xff000000
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_and_b32 s1, s3, s8
-; GCN-IR-NEXT:    s_and_b32 s0, s2, s9
-; GCN-IR-NEXT:    s_and_b32 s3, s7, s8
-; GCN-IR-NEXT:    s_and_b32 s2, s6, s9
-; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[2:3], 24
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
-; GCN-IR-NEXT:    s_lshr_b64 s[6:7], s[0:1], 24
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
+; GCN-IR-NEXT:    s_and_b32 s3, s3, s4
+; GCN-IR-NEXT:    s_and_b32 s2, s2, s7
+; GCN-IR-NEXT:    s_and_b32 s5, s5, s4
+; GCN-IR-NEXT:    s_and_b32 s4, s6, s7
+; GCN-IR-NEXT:    s_lshr_b64 s[6:7], s[2:3], 24
+; GCN-IR-NEXT:    s_lshr_b64 s[2:3], s[4:5], 24
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[2:3], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s2
+; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-IR-NEXT:    s_flbit_i32_b32 s4, s6
+; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s3
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s4, s4, 32
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-IR-NEXT:    s_flbit_i32_b32 s5, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s5
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[8:9], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
 ; GCN-IR-NEXT:    s_and_b64 s[8:9], s[8:9], vcc
 ; GCN-IR-NEXT:    s_and_b64 vcc, exec, s[8:9]
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_mov_b64 vcc, vcc
 ; GCN-IR-NEXT:    s_cbranch_vccz BB7_4
 ; GCN-IR-NEXT:  ; %bb.1: ; %udiv-bb1
 ; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
@@ -1052,11 +1057,12 @@
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
 ; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
 ; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[2:3], 0, -1, vcc
@@ -1493,31 +1499,31 @@
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, 24
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, 24
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, 24
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
-; GCN-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v4
-; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
-; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v4
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
-; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, 24
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, 24
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, 24
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
+; GCN-NEXT:    v_mov_b32_e32 v5, s11
+; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
+; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v8
+; GCN-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, 23, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
+; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], 23, v8
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1528,11 +1534,12 @@
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
 ; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 59, v2
 ; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[2:3], 0, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -36,7 +36,7 @@
 ;
 ; GFX6-LABEL: test_udivrem:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x26
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x26
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x13
 ; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x1d
@@ -44,27 +44,27 @@
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_mov_b32 s10, s6
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX6-NEXT:    s_sub_i32 s2, 0, s3
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX6-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX6-NEXT:    s_mov_b32 s11, s7
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v1
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, v2, s[0:1]
@@ -73,11 +73,11 @@
 ;
 ; GFX8-LABEL: test_udivrem:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x98
-; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x74
+; GFX8-NEXT:    s_load_dword s6, s[0:1], 0x98
+; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x74
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX8-NEXT:    s_sub_i32 s2, 0, s7
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -86,22 +86,22 @@
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x4c
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v0
+; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s7
+; GFX8-NEXT:    v_mul_lo_u32 v3, v2, s6
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s6, v3
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s7, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
-; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v3
+; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s7, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
@@ -201,40 +201,40 @@
 ; GFX8-LABEL: test_udivrem_v2:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT:    s_mov_b32 s3, 0x4f7ffffe
+; GFX8-NEXT:    s_mov_b32 s2, 0x4f7ffffe
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX8-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, s3, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, s2, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, s3, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    s_sub_i32 s2, 0, s6
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s7
+; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s7
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s6, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s6, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v2, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -402,66 +402,66 @@
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX8-NEXT:    s_sub_i32 s2, 0, s8
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s11
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX8-NEXT:    s_sub_i32 s2, 0, s8
 ; GFX8-NEXT:    s_sub_i32 s3, 0, s9
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s10
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s11
 ; GFX8-NEXT:    v_mul_f32_e32 v0, s12, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, s12, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s10
-; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
-; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX8-NEXT:    v_mul_f32_e32 v3, s12, v3
+; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s8, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s8, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX8-NEXT:    s_sub_i32 s2, 0, s11
+; GFX8-NEXT:    v_mul_f32_e32 v2, s12, v3
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX8-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v2
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s9, v1
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s9, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v5
+; GFX8-NEXT:    s_sub_i32 s2, 0, s11
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_mul_f32_e32 v3, s12, v4
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GFX8-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
 ; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s10, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v3, s11
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -78,8 +78,9 @@
 ; VI-DAG: s_cmp_eq_u32
 ; VI-DAG: s_cselect_b32 s[[SSEL:[0-9]+]], 0x3ff00000, 0
 ; VI-DAG: v_mov_b32_e32 v[[SEL:[0-9]+]], s[[SSEL]]
-; SI-DAG: v_cmp_eq_u32_e64 vcc
-; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, v{{[0-9]+}}
+; SI-DAG: s_cmp_eq_u32
+; SI-DAG: s_cselect_b64 vcc, -1, 0
+; SI-DAG: v_cndmask_b32_e32 v[[SEL:[0-9]+]], 0, {{v[0-9]+}}, vcc
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -77,7 +77,8 @@
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i1_to_f32:
-; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]\]]],
+; SI: s_cmp_eq_u32
+; SI: s_cselect_b64  [[CMP:s\[[0-9]+:[0-9]\]]], -1, 0
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
--- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -51,10 +51,13 @@
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT:    s_waitcnt expcnt(0)
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v1
 ; CHECK-NEXT:  BB1_1: ; %bb9
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CHECK-NEXT:    s_cbranch_vccnz BB1_1
 ; CHECK-NEXT:  ; %bb.2: ; %bb11
 ; CHECK-NEXT:    s_mov_b32 s3, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -251,7 +251,7 @@
 ; GCN: s_load_dwordx2 s{{\[}}[[COND0:[0-9]+]]:[[COND1:[0-9]+]]{{\]}}
 ; GCN: s_cmp_lt_i32 s[[COND0]], 1
 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
-; GCN: v_cmp_gt_i32_e64 {{[^,]*}}, s[[COND1]], 0{{$}}
+; GCN: s_cmp_gt_i32 s[[COND1]], 0{{$}}
 ; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
 ; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -130,25 +130,27 @@
 ; GCN-IR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[8:9], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s2
+; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
+; GCN-IR-NEXT:    s_cmp_eq_u32 s3, 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s8, s6
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    s_flbit_i32_b32 s10, s6
-; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
-; GCN-IR-NEXT:    s_add_i32 s10, s10, 32
-; GCN-IR-NEXT:    s_flbit_i32_b32 s11, s7
-; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-IR-NEXT:    s_add_i32 s8, s8, 32
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s11
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s10
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_flbit_i32_b32 s9, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
+; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GCN-IR-NEXT:    v_mov_b32_e32 v0, s9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, s8
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v2, v3
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[2:3], 0
-; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[10:11], 0, 0, vcc
+; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[8:9], 0, 0, vcc
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[0:1]
-; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], s[8:9]
 ; GCN-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
 ; GCN-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
 ; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[0:1], -1
@@ -861,11 +863,12 @@
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
 ; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc5, v2
 ; GCN-IR-NEXT:    v_addc_u32_e64 v1, s[2:3], 0, -1, vcc
@@ -1064,11 +1067,12 @@
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_flbit_i32_b32 s2, s6
-; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
 ; GCN-IR-NEXT:    s_add_i32 s2, s2, 32
+; GCN-IR-NEXT:    s_flbit_i32_b32 s3, s7
+; GCN-IR-NEXT:    s_cmp_eq_u32 s7, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s2
-; GCN-IR-NEXT:    v_cmp_eq_u32_e64 vcc, s7, 0
+; GCN-IR-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 59, v2
 ; GCN-IR-NEXT:    v_subb_u32_e64 v1, s[2:3], 0, 0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -8,7 +8,8 @@
 declare double @llvm.fabs.f64(double)
 
 ; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
-; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], -1, 0
 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]]
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
@@ -30,9 +31,11 @@
 ; However on GFX10 constant bus is limited to 2 scalar operands, not one.
 
 ; GCN-LABEL: {{^}}v_cnd_nan:
-; SIVI:  v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0
+; SIVI:  s_cmp_eq_u32 s{{[0-9]+}}, 0
+; SIVI:  s_cselect_b64 vcc, -1, 0
 ; SIVI:  v_cndmask_b32_e32 v{{[0-9]+}}, -1, v{{[0-9]+}}, vcc
-; GFX10: v_cmp_eq_u32_e64 [[CC:s\[[0-9:]+\]]], s{{[0-9]+}}, 0
+; GFX10: s_cmp_eq_u32 s{{[0-9]+}}, 0
+; GFX10: s_cselect_b64 [[CC:s\[[0-9:]+\]]],
 ; GFX10: v_cndmask_b32_e64 v{{[0-9]+}}, -1, s{{[0-9]+}}, [[CC]]
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -14,7 +14,8 @@
 ; GCN-ALLOCA:         buffer_load_dword
 
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
-; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
+; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
@@ -322,7 +323,8 @@
 ; GCN-ALLOCA:         buffer_load_dword
 
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2
-; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1
+; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0
 ; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]]
 ; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3
diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
--- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -35,7 +35,7 @@
 ; GCN-LABEL: extract_insert_different_dynelt_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0xd
 ; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, 0
@@ -43,26 +43,33 @@
 ; GCN-NEXT:    s_mov_b64 s[4:5], s[10:11]
 ; GCN-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GCN-NEXT:    buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xf
+; GCN-NEXT:    s_load_dword s14, s[0:1], 0xf
+; GCN-NEXT:    s_cmp_eq_u32 s13, 3
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s13, 2
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s13, 1
+; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s13, 0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 3
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
+; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s14, 1
 ; GCN-NEXT:    v_mov_b32_e32 v7, v5
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 2
-; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 1
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v0, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v0, s[2:3]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; GCN-NEXT:    s_cmp_eq_u32 s14, 2
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 2
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s14, 3
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e64 vcc, s0, 3
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GCN-NEXT:    buffer_store_dword v0, v[6:7], s[8:11], 0 addr64
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll
--- a/llvm/test/CodeGen/AMDGPU/vselect.ll
+++ b/llvm/test/CodeGen/AMDGPU/vselect.ll
@@ -12,10 +12,10 @@
 ; VI: s_cmp_gt_i32
 ; VI: s_cselect_b32
 
-; SI: v_cmp_gt_i32_e32 vcc
-; SI: v_cndmask_b32_e32
-; SI: v_cmp_gt_i32_e32 vcc
-; SI: v_cndmask_b32_e32
+; SI-DAG: s_cmp_gt_i32
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: s_cmp_gt_i32
+; SI-DAG: v_cndmask_b32_e32
 
 define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
@@ -59,10 +59,10 @@
 ; VI: s_cselect_b32
 ; VI: s_cselect_b32
 
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e64
+; SI-DAG: v_cndmask_b32_e32
 
 define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -347,9 +347,9 @@
 ; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
-; GFX1064: v_sub_co_u32 v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}
-; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}]
-; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}]
+; GFX1064: v_sub_co_u32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc
+; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc
 define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
 bb:
   %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -37,7 +37,7 @@
 
 ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0
-; GCN-DAG: v_cmp_eq_u32
+; GCN-DAG: s_cmp_eq_u32
 ; GCN:     v_cndmask_b32
 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
@@ -54,10 +54,9 @@
 ; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]]
 ; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]]
-; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
-; GCN: v_cmp_eq_u32_e32 vcc, [[MASK_A]], [[V_B]]
-
-; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
+; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
+; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
+; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
 ; GCN: buffer_store_short [[RESULT]]
 define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b