diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -67,6 +67,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Target/TargetMachine.h"
@@ -81,7 +82,6 @@
   cl::init(true));
 
 namespace {
-
 class SIFixSGPRCopies : public MachineFunctionPass {
   MachineDominatorTree *MDT;
 
@@ -94,7 +94,9 @@
 
   SIFixSGPRCopies() : MachineFunctionPass(ID) {}
 
-  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool runOnMachineFunction(MachineFunction& MF) override;
+  void lowerVGPR2SGPRCopies(MachineFunction &MF);
+  bool LowerSpecialCase(MachineInstr& MI);
 
   MachineBasicBlock *processPHINode(MachineInstr &MI);
 
@@ -569,6 +571,9 @@
   TII = ST.getInstrInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
 
+  // Lower the VGPR to SGPR copies up front, before the rest of the pass runs.
+  lowerVGPR2SGPRCopies(MF);
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                   BI != BE; ++BI) {
     MachineBasicBlock *MBB = &*BI;
@@ -640,42 +645,7 @@
           continue;
         }
 
-        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
-          Register SrcReg = MI.getOperand(1).getReg();
-          if (!SrcReg.isVirtual()) {
-            MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-            if (NewBB && NewBB != MBB) {
-              MBB = NewBB;
-              E = MBB->end();
-              BI = MachineFunction::iterator(MBB);
-              BE = MF.end();
-            }
-            assert((!NewBB || NewBB == I->getParent()) &&
-                   "moveToVALU did not return the right basic block");
-            break;
-          }
-
-          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
-          unsigned SMovOp;
-          int64_t Imm;
-          // If we are just copying an immediate, we can replace the copy with
-          // s_mov_b32.
-          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
-            MI.getOperand(1).ChangeToImmediate(Imm);
-            MI.addImplicitDefUseOperands(MF);
-            MI.setDesc(TII->get(SMovOp));
-            break;
-          }
-          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-          if (NewBB && NewBB != MBB) {
-            MBB = NewBB;
-            E = MBB->end();
-            BI = MachineFunction::iterator(MBB);
-            BE = MF.end();
-          }
-          assert((!NewBB || NewBB == I->getParent()) &&
-                 "moveToVALU did not return the right basic block");
-        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
         }
 
@@ -916,3 +886,265 @@
   }
   return CreatedBB;
 }
+
+/// Deals with the copies that can be lowered without any cost analysis.
+/// \returns true if \p MI requires no further processing (it is not a VGPR to
+/// SGPR copy, it was passed to moveToVALU, or its immediate source was folded
+/// into it), and false if the caller still has to decide how to lower it.
+bool SIFixSGPRCopies::LowerSpecialCase(MachineInstr &MI) {
+  const TargetRegisterClass *SrcRC, *DstRC;
+  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+    return true;
+
+  // A source that is not a virtual register, or is an AGPR, goes to VALU.
+  const Register Src = MI.getOperand(1).getReg();
+  if (!Src.isVirtual() || TRI->isAGPR(*MRI, Src)) {
+    TII->moveToVALU(MI, MDT);
+    return true;
+  }
+  // If we are just copying an immediate, rewrite the copy as a scalar move.
+  unsigned MovOpc;
+  int64_t ImmVal;
+  MachineInstr *SrcDef = MRI->getVRegDef(Src);
+  if (!isSafeToFoldImmIntoCopy(&MI, SrcDef, TII, MovOpc, ImmVal))
+    return false;
+  MI.getOperand(1).ChangeToImmediate(ImmVal);
+  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
+  MI.setDesc(TII->get(MovOpc));
+  return true;
+}
+
+/// Per-copy bookkeeping used when deciding how to lower a VGPR to SGPR copy.
+class V2SCopyInfo {
+public:
+  // VGPR to SGPR copy being processed
+  MachineInstr *Copy = nullptr;
+  // All SALU instructions reachable from this copy in SSA graph
+  DenseSet<MachineInstr *> SChain;
+  // Number of SGPR to VGPR copies that are used to put the SALU computation
+  // results back to VALU.
+  unsigned NumSVCopies = 0;
+  // Score of the copy. Zero-initialized so that dump() and copies of this
+  // object never read an indeterminate value before the score is computed.
+  unsigned Score = 0;
+  // Number of v_readfirstlane_b32 that must be inserted to keep SChain SALU
+  unsigned NumReadfirstlanes = 0;
+  // Current score state. Speeds up selecting V2SCopyInfos for processing.
+  bool NeedToBeConvertedToVALU = false;
+  // Unique ID. Used as a key for mapping to keep permanent order.
+  unsigned ID = 0;
+  // Last ID handed out; the two-argument constructor assigns ++NextID.
+  static unsigned NextID;
+  // Count of other VGPR to SGPR copies contributing to this copy's SChain
+  unsigned SiblingPenaulty = 0;
+  SetVector<unsigned> Siblings;
+  V2SCopyInfo() = default;
+  V2SCopyInfo(MachineInstr *C, unsigned Width)
+      : Copy(C), NumReadfirstlanes(Width / 32), ID(++NextID) {}
+  void dump() const {
+    dbgs() << ID << " : "  << *Copy
+      << "\n\tS:" << SChain.size()
+      << "\n\tSV:" << NumSVCopies
+      << "\n\tSP: " << SiblingPenaulty << "\nScore: " << Score << "\n";
+  }
+};
+unsigned V2SCopyInfo::NextID = 0;
+
+/// Decides how to lower every VGPR to SGPR copy that LowerSpecialCase leaves
+/// unhandled: each copy gets a score derived from the SALU instructions that
+/// are reachable from it within its basic block, and is then either passed to
+/// moveToVALU or replaced with v_readfirstlane_b32.
+void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
+  DenseMap<unsigned, V2SCopyInfo> Copies;
+  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenaulty;
+
+  // The main function that computes the VGPR to SGPR copy score
+  // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
+  auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
+    if (I->SChain.empty()) {
+      // Leave a defined score and verdict behind; dump() prints the score.
+      I->Score = 0;
+      I->NeedToBeConvertedToVALU = true;
+      return true;
+    }
+    I->Siblings = SiblingPenaulty[
+      *std::max_element(I->SChain.begin(), I->SChain.end(),
+       [&](MachineInstr *A, MachineInstr *B) -> bool {
+         return SiblingPenaulty[A].size() < SiblingPenaulty[B].size();
+       })];
+    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
+    SetVector<Register> SrcRegs;
+    for (auto J : I->Siblings) {
+      auto InfoIt = Copies.find(J);
+      if (InfoIt != Copies.end()) {
+        MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+        if (SiblingCopy->isImplicitDef())
+          // the COPY has already been MoveToVALUed
+          continue;
+
+        SrcRegs.insert(SiblingCopy->getOperand(1).getReg());
+      }
+    }
+    I->SiblingPenaulty = SrcRegs.size();
+
+    unsigned Penaulty = I->NumSVCopies + I->SiblingPenaulty +
+                        I->NumReadfirstlanes;
+    unsigned Profit = I->SChain.size();
+    I->Score = Penaulty > Profit ? 0 : Profit - Penaulty;
+    I->NeedToBeConvertedToVALU = I->Score < 3;
+    return I->NeedToBeConvertedToVALU;
+  };
+
+  auto needProcessing = [](MachineInstr &MI) -> bool {
+    switch (MI.getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::WQM:
+    case AMDGPU::STRICT_WQM:
+    case AMDGPU::SOFT_WQM:
+    case AMDGPU::STRICT_WWM:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!needProcessing(MI))
+        continue;
+      if (LowerSpecialCase(MI))
+        continue;
+
+      // Compute the COPY width to pass it to V2SCopyInfo Ctor
+      Register SrcReg = MI.getOperand(1).getReg();
+      const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, SrcReg);
+
+      V2SCopyInfo In(&MI, TRI->getRegSizeInBits(*RC));
+
+      SmallVector<MachineInstr *, 8> worklist;
+      // Needed because the SSA is not a tree but a graph and may have
+      // forks and joins. We should not then go same way twice.
+      SetVector<MachineInstr *> Visited;
+      worklist.push_back(&MI);
+      while (!worklist.empty()) {
+
+        MachineInstr *Inst = worklist.pop_back_val();
+
+        // The analysis is per MBB for now.
+        if (Inst->isPHI() || Inst->getParent() != MI.getParent())
+          continue;
+
+        if (!Visited.insert(Inst))
+          continue;
+
+        // Copies and REG_SEQUENCE do not contribute to the final assembly
+        // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
+        if (Inst->isCopy() || Inst->isRegSequence()) {
+          if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+            if (!Inst->isCopy() || !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+              In.NumSVCopies++;
+              continue;
+            }
+          }
+        }
+
+        SiblingPenaulty[Inst].insert(In.ID);
+
+        SmallVector<MachineInstr *, 4> Users;
+        if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+            (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+          auto I = Inst->getIterator();
+          auto E = Inst->getParent()->end();
+          while ((++I) != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
+            if (I->readsRegister(AMDGPU::SCC))
+              Users.push_back(&*I);
+          }
+        } else if (Inst->getNumExplicitDefs() != 0) {
+          Register Reg = Inst->getOperand(0).getReg();
+          if (TRI->isSGPRReg(*MRI, Reg))
+            for (auto &U : MRI->use_instructions(Reg))
+              Users.push_back(&U);
+        }
+        for (auto U : Users) {
+          if (TII->isSALU(*U))
+            In.SChain.insert(U);
+          worklist.push_back(U);
+        }
+      }
+      Copies[In.ID] = In;
+    }
+  }
+
+  SmallVector<unsigned, 8> Worklist;
+  for (auto &C : Copies)
+    if (needToBeConvertedToVALU(&C.second))
+      Worklist.push_back(C.second.ID);
+
+  while (!Worklist.empty()) {
+    unsigned CurID = Worklist.pop_back_val();
+    auto CurInfoIt = Copies.find(CurID);
+    if (CurInfoIt != Copies.end()) {
+      // Take a copy: the map entry is erased below while C.Copy is still used.
+      V2SCopyInfo C = CurInfoIt->getSecond();
+      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
+      for (auto S : C.Siblings) {
+        auto SibInfoIt = Copies.find(S);
+        if (SibInfoIt != Copies.end()) {
+          V2SCopyInfo &SI = SibInfoIt->getSecond();
+          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
+          if (!SI.NeedToBeConvertedToVALU) {
+            set_subtract(SI.SChain, C.SChain);
+            if (needToBeConvertedToVALU(&SI))
+              Worklist.push_back(SI.ID);
+          }
+          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
+        }
+      }
+      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy << " is being turned to VALU\n");
+      Copies.erase(C.ID);
+      TII->moveToVALU(*C.Copy, MDT);
+    }
+  }
+
+  // Now do actual lowering
+  for (auto &C : Copies) {
+    MachineInstr *MI = C.second.Copy;
+    MachineBasicBlock *MBB = MI->getParent();
+    // Every copy still left in the map is lowered to v_readfirstlane_b32.
+    LLVM_DEBUG(dbgs() << "V2S copy " << *MI << " is being turned to v_readfirstlane_b32" <<
+      "Score: " << C.second.Score << "\n");
+    Register DstReg = MI->getOperand(0).getReg();
+    Register SrcReg = MI->getOperand(1).getReg();
+    unsigned SubReg = MI->getOperand(1).getSubReg();
+    bool IsSubReg = SubReg != AMDGPU::NoSubRegister;
+    const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
+    if (IsSubReg)
+      SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
+    if (TRI->getRegSizeInBits(*SrcRC) == 32) {
+      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                          TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+      if (IsSubReg)
+        MIB.addReg(SrcReg, 0, SubReg);
+      else
+        MIB.addReg(SrcReg);
+    } else {
+      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
+      for (int i = 0; i < N; i++) {
+        // N is not capped at four here, so take the index from the channel.
+        Register PartialSrc = TII->buildExtractSubReg(
+            Result, *MRI, MI->getOperand(1), SrcRC,
+            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
+        Register PartialDst =
+            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        BuildMI(*MBB, *Result, Result->getDebugLoc(),
+                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
+            .addReg(PartialSrc);
+        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
+      }
+    }
+    MI->eraseFromParent();
+  }
+}
diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll
--- a/llvm/test/CodeGen/AMDGPU/add3.ll
+++ b/llvm/test/CodeGen/AMDGPU/add3.ll
@@ -222,7 +222,7 @@
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
 ; VI-NEXT:    v_add_f32_e32 v2, s4, v2
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-LABEL: add3_uniform_vgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -521,175 +521,168 @@
 define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg2, i64 %arg3, <2 x half> %arg4, <2 x half> %arg5) #3 {
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908:       ; %bb.0: ; %bb
-; GFX908-NEXT:    global_load_ushort v0, v[0:1], off glc
+; GFX908-NEXT:    global_load_ushort v24, v[0:1], off glc
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
 ; GFX908-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
-; GFX908-NEXT:    s_load_dword s7, s[4:5], 0x18
-; GFX908-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX908-NEXT:    s_mov_b32 s6, 0
+; GFX908-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX908-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_u32_e32 v1, s1
+; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s1
 ; GFX908-NEXT:    s_sub_i32 s4, 0, s1
-; GFX908-NEXT:    s_lshl_b64 s[10:11], s[2:3], 5
-; GFX908-NEXT:    s_or_b32 s10, s10, 28
-; GFX908-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX908-NEXT:    v_mov_b32_e32 v35, s10
-; GFX908-NEXT:    s_lshr_b32 s12, s7, 16
-; GFX908-NEXT:    v_mov_b32_e32 v10, s11
-; GFX908-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v1
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v26, s7
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v27, s12
-; GFX908-NEXT:    v_accvgpr_write_b32 a0, v35
-; GFX908-NEXT:    v_mul_lo_u32 v1, s4, v2
-; GFX908-NEXT:    v_accvgpr_write_b32 a1, v10
-; GFX908-NEXT:    s_lshl_b64 s[4:5], s[8:9], 5
-; GFX908-NEXT:    v_mul_hi_u32 v3, v2, v1
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0
-; GFX908-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX908-NEXT:    v_mul_hi_u32 v4, s0, v2
-; GFX908-NEXT:    v_mul_lo_u32 v5, v4, s1
-; GFX908-NEXT:    v_add_u32_e32 v6, 1, v4
-; GFX908-NEXT:    v_sub_u32_e32 v5, s0, v5
-; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s1, v5
-; GFX908-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX908-NEXT:    v_subrev_u32_e32 v6, s1, v5
-; GFX908-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
-; GFX908-NEXT:    v_add_u32_e32 v7, 1, v4
-; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s1, v5
+; GFX908-NEXT:    s_lshr_b32 s11, s8, 16
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v25, s8
+; GFX908-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX908-NEXT:    s_lshl_b64 s[8:9], s[2:3], 5
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v26, s11
+; GFX908-NEXT:    s_or_b32 s8, s8, 28
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX908-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX908-NEXT:    v_mov_b32_e32 v7, s3
+; GFX908-NEXT:    s_mov_b32 s10, 0
+; GFX908-NEXT:    v_mov_b32_e32 v6, s2
+; GFX908-NEXT:    v_mul_lo_u32 v2, s4, v0
+; GFX908-NEXT:    s_lshl_b64 s[4:5], s[6:7], 5
+; GFX908-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX908-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX908-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX908-NEXT:    v_mov_b32_e32 v2, s8
+; GFX908-NEXT:    v_mov_b32_e32 v3, s9
+; GFX908-NEXT:    v_mul_lo_u32 v4, v0, s1
+; GFX908-NEXT:    v_add_u32_e32 v5, 1, v0
+; GFX908-NEXT:    v_sub_u32_e32 v4, s0, v4
+; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s1, v4
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX908-NEXT:    v_subrev_u32_e32 v5, s1, v4
+; GFX908-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX908-NEXT:    v_add_u32_e32 v5, 1, v0
+; GFX908-NEXT:    v_cmp_le_u32_e32 vcc, s1, v4
+; GFX908-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX908-NEXT:    v_lshlrev_b64 v[4:5], 5, v[0:1]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_and_b32_e32 v28, 0xffff, v0
-; GFX908-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
-; GFX908-NEXT:    v_mul_lo_u32 v8, s9, v28
-; GFX908-NEXT:    v_mul_hi_u32 v9, s8, v28
-; GFX908-NEXT:    v_lshlrev_b64 v[2:3], 5, v[0:1]
-; GFX908-NEXT:    v_mul_lo_u32 v6, s8, v28
-; GFX908-NEXT:    v_add_u32_e32 v7, v9, v8
-; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
-; GFX908-NEXT:    v_lshlrev_b64 v[6:7], 5, v[6:7]
-; GFX908-NEXT:    v_mov_b32_e32 v9, s3
-; GFX908-NEXT:    v_mov_b32_e32 v8, s2
+; GFX908-NEXT:    v_readfirstlane_b32 s0, v24
+; GFX908-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX908-NEXT:    s_mul_i32 s1, s7, s0
+; GFX908-NEXT:    s_mul_hi_u32 s7, s6, s0
+; GFX908-NEXT:    s_mul_i32 s0, s6, s0
+; GFX908-NEXT:    s_add_i32 s1, s7, s1
+; GFX908-NEXT:    s_lshl_b64 s[6:7], s[0:1], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
 ; GFX908-NEXT:  .LBB3_1: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v0
-; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX908-NEXT:    v_accvgpr_read_b32 v5, a3
-; GFX908-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX908-NEXT:    v_accvgpr_read_b32 v2, a0
-; GFX908-NEXT:    v_accvgpr_read_b32 v4, a2
+; GFX908-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
 ; GFX908-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
-; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    v_accvgpr_write_b32 a0, v2
-; GFX908-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX908-NEXT:  .LBB3_2: ; %bb9
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
 ; GFX908-NEXT:    s_cbranch_scc0 .LBB3_1
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    v_mov_b32_e32 v10, 0
-; GFX908-NEXT:    v_mov_b32_e32 v11, 0
-; GFX908-NEXT:    global_load_dwordx2 v[10:11], v[10:11], off
-; GFX908-NEXT:    s_mov_b32 s7, s6
-; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[8:9]
-; GFX908-NEXT:    v_accvgpr_read_b32 v13, a1
-; GFX908-NEXT:    v_mov_b32_e32 v15, s7
-; GFX908-NEXT:    v_mov_b32_e32 v17, s7
-; GFX908-NEXT:    v_accvgpr_read_b32 v12, a0
-; GFX908-NEXT:    v_mov_b32_e32 v14, s6
-; GFX908-NEXT:    v_mov_b32_e32 v16, s6
+; GFX908-NEXT:    v_mov_b32_e32 v8, 0
+; GFX908-NEXT:    v_mov_b32_e32 v9, 0
+; GFX908-NEXT:    global_load_dwordx2 v[8:9], v[8:9], off
+; GFX908-NEXT:    s_mov_b32 s11, s10
+; GFX908-NEXT:    v_mov_b32_e32 v13, s11
+; GFX908-NEXT:    v_mov_b32_e32 v15, s11
+; GFX908-NEXT:    v_mov_b32_e32 v17, s11
+; GFX908-NEXT:    v_mov_b32_e32 v12, s10
+; GFX908-NEXT:    v_mov_b32_e32 v14, s10
+; GFX908-NEXT:    v_mov_b32_e32 v16, s10
+; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v11, v3
+; GFX908-NEXT:    v_mov_b32_e32 v19, v13
+; GFX908-NEXT:    v_mov_b32_e32 v10, v2
+; GFX908-NEXT:    v_mov_b32_e32 v18, v12
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_add_co_u32_e32 v20, vcc, 1, v10
-; GFX908-NEXT:    v_addc_co_u32_e32 v18, vcc, 0, v11, vcc
-; GFX908-NEXT:    v_mul_lo_u32 v21, s4, v18
-; GFX908-NEXT:    v_mul_hi_u32 v22, s4, v20
-; GFX908-NEXT:    v_mul_lo_u32 v23, s5, v20
-; GFX908-NEXT:    v_mul_lo_u32 v29, s4, v20
-; GFX908-NEXT:    v_mov_b32_e32 v19, s7
-; GFX908-NEXT:    v_add_u32_e32 v20, v22, v21
-; GFX908-NEXT:    v_add_u32_e32 v30, v20, v23
-; GFX908-NEXT:    v_mov_b32_e32 v21, s7
-; GFX908-NEXT:    v_mov_b32_e32 v18, s6
-; GFX908-NEXT:    v_mov_b32_e32 v20, s6
+; GFX908-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX908-NEXT:    v_readfirstlane_b32 s3, v9
+; GFX908-NEXT:    s_add_u32 s2, s2, 1
+; GFX908-NEXT:    s_addc_u32 s3, s3, 0
+; GFX908-NEXT:    s_mul_hi_u32 s9, s4, s2
+; GFX908-NEXT:    s_mul_i32 s11, s5, s2
+; GFX908-NEXT:    s_mul_i32 s8, s4, s2
+; GFX908-NEXT:    s_mul_i32 s2, s4, s3
+; GFX908-NEXT:    s_add_i32 s2, s9, s2
+; GFX908-NEXT:    s_add_i32 s9, s2, s11
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v28
-; GFX908-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX908-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
-; GFX908-NEXT:    v_add_co_u32_e64 v12, s[2:3], v12, v6
-; GFX908-NEXT:    v_addc_co_u32_e64 v13, s[2:3], v13, v7, s[2:3]
+; GFX908-NEXT:    v_add_co_u32_sdwa v8, vcc, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX908-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX908-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[8:9]
+; GFX908-NEXT:    v_mov_b32_e32 v20, s7
+; GFX908-NEXT:    v_add_co_u32_e64 v10, s[2:3], s6, v10
+; GFX908-NEXT:    v_addc_co_u32_e64 v11, s[2:3], v11, v20, s[2:3]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    v_add_co_u32_e32 v22, vcc, v12, v29
-; GFX908-NEXT:    v_addc_co_u32_e32 v23, vcc, v13, v30, vcc
-; GFX908-NEXT:    global_load_dword v32, v[22:23], off offset:-12 glc
+; GFX908-NEXT:    v_mov_b32_e32 v21, s9
+; GFX908-NEXT:    v_add_co_u32_e32 v20, vcc, s8, v10
+; GFX908-NEXT:    v_addc_co_u32_e32 v21, vcc, v11, v21, vcc
+; GFX908-NEXT:    global_load_dword v28, v[20:21], off offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v31, v[22:23], off offset:-8 glc
+; GFX908-NEXT:    global_load_dword v27, v[20:21], off offset:-8 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v24, v[22:23], off offset:-4 glc
+; GFX908-NEXT:    global_load_dword v22, v[20:21], off offset:-4 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v22, v[22:23], off glc
+; GFX908-NEXT:    global_load_dword v20, v[20:21], off glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    ds_read_b64 v[22:23], v1
-; GFX908-NEXT:    ds_read_b64 v[24:25], v0
+; GFX908-NEXT:    ds_read_b64 v[20:21], v1
+; GFX908-NEXT:    ds_read_b64 v[22:23], v0
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_cbranch_vccnz .LBB3_4
 ; GFX908-NEXT:  ; %bb.6: ; %bb51
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    v_cvt_f32_f16_sdwa v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v32, v32
-; GFX908-NEXT:    v_cvt_f32_f16_sdwa v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v31, v31
-; GFX908-NEXT:    v_add_f32_e32 v4, v26, v22
-; GFX908-NEXT:    v_add_f32_e32 v5, v27, v23
-; GFX908-NEXT:    v_add_f32_e32 v2, 0, v22
-; GFX908-NEXT:    v_add_f32_e32 v3, 0, v23
-; GFX908-NEXT:    v_add_f32_e32 v25, v33, v25
-; GFX908-NEXT:    v_add_f32_e32 v24, v32, v24
-; GFX908-NEXT:    v_add_f32_e32 v23, v34, v23
-; GFX908-NEXT:    v_add_f32_e32 v22, v31, v22
-; GFX908-NEXT:    v_add_f32_e32 v15, v15, v5
-; GFX908-NEXT:    v_add_f32_e32 v14, v14, v4
-; GFX908-NEXT:    v_add_f32_e32 v17, v17, v3
-; GFX908-NEXT:    v_add_f32_e32 v16, v16, v2
-; GFX908-NEXT:    v_add_f32_e32 v18, v18, v24
-; GFX908-NEXT:    v_add_f32_e32 v19, v19, v25
-; GFX908-NEXT:    v_add_f32_e32 v20, v20, v22
-; GFX908-NEXT:    v_add_f32_e32 v21, v21, v23
+; GFX908-NEXT:    v_cvt_f32_f16_sdwa v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; GFX908-NEXT:    v_cvt_f32_f16_sdwa v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; GFX908-NEXT:    v_add_f32_e32 v31, v25, v20
+; GFX908-NEXT:    v_add_f32_e32 v32, v26, v21
+; GFX908-NEXT:    v_add_f32_e32 v33, 0, v20
+; GFX908-NEXT:    v_add_f32_e32 v34, 0, v21
+; GFX908-NEXT:    v_add_f32_e32 v23, v29, v23
+; GFX908-NEXT:    v_add_f32_e32 v22, v28, v22
+; GFX908-NEXT:    v_add_f32_e32 v21, v30, v21
+; GFX908-NEXT:    v_add_f32_e32 v20, v27, v20
+; GFX908-NEXT:    v_add_f32_e32 v13, v13, v32
+; GFX908-NEXT:    v_add_f32_e32 v12, v12, v31
+; GFX908-NEXT:    v_add_f32_e32 v15, v15, v34
+; GFX908-NEXT:    v_add_f32_e32 v14, v14, v33
+; GFX908-NEXT:    v_add_f32_e32 v16, v16, v22
+; GFX908-NEXT:    v_add_f32_e32 v17, v17, v23
+; GFX908-NEXT:    v_add_f32_e32 v18, v18, v20
+; GFX908-NEXT:    v_add_f32_e32 v19, v19, v21
 ; GFX908-NEXT:    s_branch .LBB3_4
 ;
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
 ; GFX90A:       ; %bb.0: ; %bb
-; GFX90A-NEXT:    global_load_ushort v10, v[0:1], off glc
+; GFX90A-NEXT:    global_load_ushort v28, v[0:1], off glc
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
-; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x10
+; GFX90A-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x10
 ; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x18
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
-; GFX90A-NEXT:    s_mov_b32 s4, 0
+; GFX90A-NEXT:    s_mov_b32 s8, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
-; GFX90A-NEXT:    s_sub_i32 s5, 0, s7
+; GFX90A-NEXT:    s_sub_i32 s9, 0, s7
+; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[10:11], 5
+; GFX90A-NEXT:    s_or_b32 s4, s4, 28
+; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_lshr_b32 s12, s2, 16
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s2
-; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s12
-; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[8:9], 5
-; GFX90A-NEXT:    s_or_b32 s10, s10, 28
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s12
 ; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
-; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s9, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
 ; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v8
 ; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
@@ -704,14 +697,15 @@
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s7, v8
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX90A-NEXT:    v_lshlrev_b64 v[8:9], 5, v[0:1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], 0, 0
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_and_b32_e32 v30, 0xffff, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v11, s1, v30
-; GFX90A-NEXT:    v_mul_hi_u32 v12, s0, v30
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s0, v30
-; GFX90A-NEXT:    v_add_u32_e32 v11, v12, v11
-; GFX90A-NEXT:    v_lshlrev_b64 v[10:11], 5, v[10:11]
-; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], 0, 0
+; GFX90A-NEXT:    v_readfirstlane_b32 s4, v28
+; GFX90A-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX90A-NEXT:    s_mul_i32 s1, s1, s4
+; GFX90A-NEXT:    s_mul_hi_u32 s5, s0, s4
+; GFX90A-NEXT:    s_mul_i32 s0, s0, s4
+; GFX90A-NEXT:    s_add_i32 s1, s5, s1
+; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[0:1], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
 ; GFX90A-NEXT:  .LBB3_1: ; %bb12
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
@@ -725,66 +719,70 @@
 ; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_1
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    global_load_dwordx2 v[14:15], v[12:13], off
-; GFX90A-NEXT:    s_mov_b32 s5, s4
+; GFX90A-NEXT:    global_load_dwordx2 v[12:13], v[10:11], off
+; GFX90A-NEXT:    s_mov_b32 s9, s8
+; GFX90A-NEXT:    v_pk_mov_b32 v[16:17], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[18:19], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[20:21], s[8:9], s[8:9] op_sel:[0,1]
 ; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], 0, v[4:5]
-; GFX90A-NEXT:    v_pk_mov_b32 v[16:17], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[18:19], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[20:21], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[22:23], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_add_co_u32_e32 v24, vcc, 1, v14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v15, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v25, s2, v25
-; GFX90A-NEXT:    v_mul_hi_u32 v26, s2, v24
-; GFX90A-NEXT:    v_mul_lo_u32 v27, s3, v24
-; GFX90A-NEXT:    v_mul_lo_u32 v31, s2, v24
-; GFX90A-NEXT:    v_add_u32_e32 v24, v26, v25
-; GFX90A-NEXT:    v_add_u32_e32 v32, v24, v27
-; GFX90A-NEXT:    v_pk_mov_b32 v[24:25], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_readfirstlane_b32 s6, v12
+; GFX90A-NEXT:    v_readfirstlane_b32 s7, v13
+; GFX90A-NEXT:    s_add_u32 s6, s6, 1
+; GFX90A-NEXT:    s_addc_u32 s7, s7, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s9, s2, s6
+; GFX90A-NEXT:    s_mul_i32 s7, s2, s7
+; GFX90A-NEXT:    s_mul_i32 s10, s3, s6
+; GFX90A-NEXT:    s_add_i32 s7, s9, s7
+; GFX90A-NEXT:    s_mul_i32 s6, s2, s6
+; GFX90A-NEXT:    s_add_i32 s7, s7, s10
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v30
-; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v17, vcc, v17, v11, vcc
-; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX90A-NEXT:    v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX90A-NEXT:    v_mov_b32_e32 v24, s5
+; GFX90A-NEXT:    v_add_co_u32_e32 v14, vcc, s4, v14
+; GFX90A-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v24, vcc
+; GFX90A-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[12:13]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    v_add_co_u32_e32 v26, vcc, v16, v31
-; GFX90A-NEXT:    v_addc_co_u32_e32 v27, vcc, v17, v32, vcc
-; GFX90A-NEXT:    global_load_dword v34, v[26:27], off offset:-12 glc
+; GFX90A-NEXT:    v_mov_b32_e32 v25, s7
+; GFX90A-NEXT:    v_add_co_u32_e32 v24, vcc, s6, v14
+; GFX90A-NEXT:    v_addc_co_u32_e32 v25, vcc, v15, v25, vcc
+; GFX90A-NEXT:    global_load_dword v30, v[24:25], off offset:-12 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v33, v[26:27], off offset:-8 glc
+; GFX90A-NEXT:    global_load_dword v29, v[24:25], off offset:-8 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v28, v[26:27], off offset:-4 glc
+; GFX90A-NEXT:    global_load_dword v26, v[24:25], off offset:-4 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v28, v[26:27], off glc
+; GFX90A-NEXT:    global_load_dword v26, v[24:25], off glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    ; kill: killed $vgpr26 killed $vgpr27
-; GFX90A-NEXT:    ds_read_b64 v[26:27], v1
+; GFX90A-NEXT:    ; kill: killed $vgpr24 killed $vgpr25
+; GFX90A-NEXT:    ds_read_b64 v[24:25], v1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    ds_read_b64 v[28:29], v0
+; GFX90A-NEXT:    ds_read_b64 v[26:27], v0
 ; GFX90A-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.6: ; %bb51
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v34, v34
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v37, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v36, v33
-; GFX90A-NEXT:    v_pk_add_f32 v[38:39], v[2:3], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[40:41], v[26:27], 0 op_sel_hi:[1,0]
-; GFX90A-NEXT:    v_pk_add_f32 v[28:29], v[34:35], v[28:29]
-; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[36:37], v[26:27]
-; GFX90A-NEXT:    v_pk_add_f32 v[18:19], v[18:19], v[38:39]
-; GFX90A-NEXT:    v_pk_add_f32 v[20:21], v[20:21], v[40:41]
-; GFX90A-NEXT:    v_pk_add_f32 v[22:23], v[22:23], v[28:29]
-; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[24:25], v[26:27]
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v33, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v32, v29
+; GFX90A-NEXT:    v_pk_add_f32 v[34:35], v[2:3], v[24:25]
+; GFX90A-NEXT:    v_pk_add_f32 v[36:37], v[24:25], 0 op_sel_hi:[1,0]
+; GFX90A-NEXT:    v_pk_add_f32 v[26:27], v[30:31], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[24:25], v[32:33], v[24:25]
+; GFX90A-NEXT:    v_pk_add_f32 v[16:17], v[16:17], v[34:35]
+; GFX90A-NEXT:    v_pk_add_f32 v[18:19], v[18:19], v[36:37]
+; GFX90A-NEXT:    v_pk_add_f32 v[20:21], v[20:21], v[26:27]
+; GFX90A-NEXT:    v_pk_add_f32 v[22:23], v[22:23], v[24:25]
 ; GFX90A-NEXT:    s_branch .LBB3_4
 bb:
   %i = load volatile i16, i16 addrspace(4)* undef, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -51,7 +51,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
@@ -145,7 +145,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -161,27 +161,28 @@
 ; GFX9-LABEL: urem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GFX9-NEXT:    s_sub_i32 s4, 0, s3
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    s_sub_i32 s4, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s4, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i32 %x, %y
@@ -372,7 +373,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
@@ -390,35 +391,36 @@
 ; GFX9-LABEL: srem_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i32 %x, %y
@@ -697,7 +699,7 @@
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
@@ -919,7 +921,7 @@
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -1198,9 +1200,9 @@
 ; GFX6-NEXT:    s_sub_i32 s2, 0, s10
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
@@ -1226,7 +1228,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -1522,7 +1524,7 @@
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
 ; GFX6-NEXT:    s_sub_i32 s4, 0, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
@@ -1564,68 +1566,76 @@
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s3, 0, s9
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s2, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s2, s3, s2
+; GFX9-NEXT:    s_add_i32 s3, s3, s2
+; GFX9-NEXT:    s_mul_hi_u32 s2, s4, s3
+; GFX9-NEXT:    s_mul_i32 s2, s2, s8
+; GFX9-NEXT:    s_sub_i32 s2, s4, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, s2, s8
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX9-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX9-NEXT:    s_sub_i32 s3, 0, s9
+; GFX9-NEXT:    s_mul_i32 s3, s3, s12
+; GFX9-NEXT:    s_mul_hi_u32 s3, s12, s3
+; GFX9-NEXT:    s_add_i32 s12, s12, s3
+; GFX9-NEXT:    s_mul_hi_u32 s3, s5, s12
+; GFX9-NEXT:    s_mul_i32 s3, s3, s9
+; GFX9-NEXT:    s_sub_i32 s3, s5, s3
+; GFX9-NEXT:    s_sub_i32 s4, s3, s9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v1
-; GFX9-NEXT:    s_sub_i32 s2, 0, s10
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s11
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX9-NEXT:    s_sub_i32 s2, 0, s11
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
-; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s11
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s9
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    s_sub_i32 s4, s3, s9
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s9
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
+; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX9-NEXT:    s_sub_i32 s4, 0, s10
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX9-NEXT:    s_mul_i32 s4, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_mul_hi_u32 s4, s6, s5
+; GFX9-NEXT:    s_mul_i32 s4, s4, s10
+; GFX9-NEXT:    s_sub_i32 s4, s6, s4
+; GFX9-NEXT:    s_sub_i32 s5, s4, s10
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s10
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX9-NEXT:    s_sub_i32 s5, s4, s10
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s10
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX9-NEXT:    s_sub_i32 s5, 0, s11
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s11
+; GFX9-NEXT:    s_sub_i32 s5, s7, s5
+; GFX9-NEXT:    s_sub_i32 s6, s5, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s11
+; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX9-NEXT:    s_sub_i32 s6, s5, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s11
+; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = urem <4 x i32> %x, %y
@@ -1831,7 +1841,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_sub_i32 s0, 0, s9
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
@@ -1903,7 +1913,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
@@ -2323,105 +2333,113 @@
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
-; GFX9-NEXT:    s_add_i32 s8, s8, s2
-; GFX9-NEXT:    s_xor_b32 s2, s8, s2
+; GFX9-NEXT:    s_add_i32 s3, s8, s2
+; GFX9-NEXT:    s_xor_b32 s2, s3, s2
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
-; GFX9-NEXT:    s_add_i32 s8, s9, s3
-; GFX9-NEXT:    s_sub_i32 s12, 0, s2
+; GFX9-NEXT:    s_sub_i32 s8, 0, s2
+; GFX9-NEXT:    s_ashr_i32 s3, s4, 31
+; GFX9-NEXT:    s_add_i32 s4, s4, s3
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_xor_b32 s3, s8, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
+; GFX9-NEXT:    s_xor_b32 s4, s4, s3
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_add_i32 s4, s4, s8
-; GFX9-NEXT:    s_xor_b32 s4, s4, s8
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_sub_i32 s12, 0, s3
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    s_ashr_i32 s12, s10, 31
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    s_add_i32 s5, s5, s9
-; GFX9-NEXT:    s_xor_b32 s5, s5, s9
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
-; GFX9-NEXT:    s_add_i32 s2, s10, s12
-; GFX9-NEXT:    s_xor_b32 s2, s2, s12
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
-; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v0
+; GFX9-NEXT:    s_mul_i32 s8, s8, s12
+; GFX9-NEXT:    s_mul_hi_u32 s8, s12, s8
+; GFX9-NEXT:    s_add_i32 s12, s12, s8
+; GFX9-NEXT:    s_mul_hi_u32 s8, s4, s12
+; GFX9-NEXT:    s_mul_i32 s8, s8, s2
+; GFX9-NEXT:    s_sub_i32 s4, s4, s8
+; GFX9-NEXT:    s_sub_i32 s8, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s4
+; GFX9-NEXT:    s_sub_i32 s8, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s8, s4
+; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX9-NEXT:    s_add_i32 s8, s9, s4
+; GFX9-NEXT:    s_xor_b32 s4, s8, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    s_add_i32 s5, s5, s8
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_sub_i32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s3, s5, s8
+; GFX9-NEXT:    s_sub_i32 s5, 0, s4
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s9
+; GFX9-NEXT:    s_mul_hi_u32 s5, s9, s5
+; GFX9-NEXT:    s_add_i32 s9, s9, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s9
+; GFX9-NEXT:    s_mul_i32 s5, s5, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
+; GFX9-NEXT:    s_sub_i32 s5, s3, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_sub_i32 s5, s3, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s3, s4
+; GFX9-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s10, 31
+; GFX9-NEXT:    s_add_i32 s5, s10, s4
+; GFX9-NEXT:    s_xor_b32 s4, s5, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s8
+; GFX9-NEXT:    s_sub_i32 s3, s3, s8
+; GFX9-NEXT:    s_sub_i32 s8, 0, s4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_ashr_i32 s5, s6, 31
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_xor_b32 s6, s6, s5
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX9-NEXT:    s_mul_i32 s8, s8, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s9, s8
+; GFX9-NEXT:    s_add_i32 s9, s9, s8
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s9
+; GFX9-NEXT:    s_mul_i32 s8, s8, s4
+; GFX9-NEXT:    s_sub_i32 s6, s6, s8
+; GFX9-NEXT:    s_sub_i32 s8, s6, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX9-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX9-NEXT:    s_sub_i32 s8, s6, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX9-NEXT:    s_cselect_b32 s4, s8, s6
+; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX9-NEXT:    s_add_i32 s8, s11, s6
+; GFX9-NEXT:    s_xor_b32 s6, s8, s6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
+; GFX9-NEXT:    s_xor_b32 s3, s4, s5
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
+; GFX9-NEXT:    s_add_i32 s4, s7, s2
+; GFX9-NEXT:    s_sub_i32 s3, s3, s5
+; GFX9-NEXT:    s_sub_i32 s5, 0, s6
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
-; GFX9-NEXT:    s_sub_i32 s3, 0, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v2
-; GFX9-NEXT:    s_ashr_i32 s3, s11, 31
-; GFX9-NEXT:    s_add_i32 s4, s11, s3
-; GFX9-NEXT:    s_xor_b32 s3, s4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
-; GFX9-NEXT:    s_add_i32 s5, s6, s4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX9-NEXT:    s_xor_b32 s5, s5, s4
-; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
-; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    s_sub_i32 s6, 0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX9-NEXT:    v_xor_b32_e32 v1, s9, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
-; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
-; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
-; GFX9-NEXT:    s_add_i32 s6, s7, s5
-; GFX9-NEXT:    s_xor_b32 s6, s6, s5
-; GFX9-NEXT:    v_subrev_u32_e32 v6, s2, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v2, s4, v2
-; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v5, s3, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s4, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
+; GFX9-NEXT:    s_xor_b32 s4, s4, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX9-NEXT:    s_mul_i32 s5, s5, s7
+; GFX9-NEXT:    s_mul_hi_u32 s5, s7, s5
+; GFX9-NEXT:    s_add_i32 s7, s7, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s4, s7
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_sub_i32 s4, s4, s5
+; GFX9-NEXT:    s_sub_i32 s5, s4, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s6
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX9-NEXT:    s_sub_i32 s5, s4, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s6
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX9-NEXT:    s_xor_b32 s4, s4, s2
+; GFX9-NEXT:    s_sub_i32 s2, s4, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %r = srem <4 x i32> %x, %y
@@ -2770,7 +2788,7 @@
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s4, s7, 16
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v1, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v2, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GFX6-NEXT:    s_lshr_b32 s6, s5, 16
@@ -2995,7 +3013,7 @@
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s6
@@ -3263,7 +3281,7 @@
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX6-NEXT:    v_alignbit_b32 v2, s7, v2, 16
 ; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
@@ -3283,7 +3301,7 @@
 ; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    s_sext_i32_i16 s4, s7
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s4
@@ -3617,7 +3635,7 @@
 ; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -3701,7 +3719,7 @@
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_lshr_b32 s3, s4, 8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -4191,7 +4209,7 @@
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
 ; GFX6-NEXT:    s_xor_b32 s4, s4, s6
@@ -4207,7 +4225,7 @@
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    s_sext_i32_i16 s5, s5
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
 ; GFX6-NEXT:    s_xor_b32 s4, s5, s4
@@ -4435,7 +4453,7 @@
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s4
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s7
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
@@ -4821,7 +4839,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 15
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0x7fff, v6
@@ -5008,7 +5026,7 @@
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GFX6-NEXT:    s_xor_b32 s0, s1, s0
@@ -5024,7 +5042,7 @@
 ; GFX6-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -5233,7 +5251,7 @@
 ; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v7, vcc
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0x7fff, v2
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v4, s0
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, s0
@@ -5256,7 +5274,7 @@
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v8, vcc
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v0
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
 ; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v2
@@ -5764,7 +5782,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
 ; GFX6-NEXT:    s_mov_b32 s2, -1
@@ -5998,7 +6016,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s7
@@ -6022,46 +6040,49 @@
 ; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GFX9-NEXT:    s_sub_i32 s6, 0, s3
-; GFX9-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v4, s2, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s4, s7
+; GFX9-NEXT:    s_mul_i32 s6, s6, s3
+; GFX9-NEXT:    s_sub_i32 s4, s4, s6
+; GFX9-NEXT:    s_sub_i32 s6, s4, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX9-NEXT:    s_sub_i32 s6, s4, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX9-NEXT:    s_cselect_b32 s3, s6, s4
+; GFX9-NEXT:    s_sub_i32 s4, 0, s2
+; GFX9-NEXT:    s_mul_i32 s4, s4, s8
+; GFX9-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX9-NEXT:    s_add_i32 s8, s8, s4
+; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s8
+; GFX9-NEXT:    s_mul_i32 s4, s4, s2
+; GFX9-NEXT:    s_sub_i32 s4, s5, s4
+; GFX9-NEXT:    s_sub_i32 s5, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX9-NEXT:    s_sub_i32 s5, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -6179,7 +6200,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
 ; GFX6-NEXT:    s_xor_b32 s2, s0, s8
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
@@ -6483,7 +6504,7 @@
 ; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
 ; GFX6-NEXT:    s_add_i32 s1, s9, s0
 ; GFX6-NEXT:    s_xor_b32 s1, s1, s0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
@@ -6588,18 +6609,19 @@
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
-; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
-; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GFX6-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX6-NEXT:    s_add_i32 s5, s5, s4
+; GFX6-NEXT:    s_lshr_b32 s6, s5, 31
+; GFX6-NEXT:    s_ashr_i32 s5, s5, 20
+; GFX6-NEXT:    s_add_i32 s5, s5, s6
+; GFX6-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
+; GFX6-NEXT:    s_sub_i32 s4, s4, s5
+; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -6693,7 +6715,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
@@ -6711,36 +6733,37 @@
 ; GFX9-LABEL: srem_i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
 ; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX9-NEXT:    s_add_i32 s3, s3, s4
 ; GFX9-NEXT:    s_xor_b32 s3, s3, s4
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_sub_i32 s4, 0, s3
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_i32 s5, 0, s3
+; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_xor_b32 s2, s2, s4
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
-; GFX9-NEXT:    s_add_i32 s2, s2, s4
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s6, s5
+; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX9-NEXT:    s_mul_i32 s5, s5, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s5
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s2, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s4
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_sub_i32 s2, s2, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
@@ -6919,7 +6942,7 @@
 ; GFX6-NEXT:    s_sub_i32 s9, 0, s7
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
 ; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
@@ -6954,62 +6977,65 @@
 ; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT:    s_ashr_i32 s6, s3, 31
-; GFX9-NEXT:    s_add_i32 s3, s3, s6
-; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT:    s_xor_b32 s3, s3, s6
-; GFX9-NEXT:    s_ashr_i32 s7, s2, 31
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT:    s_add_i32 s2, s2, s7
-; GFX9-NEXT:    s_xor_b32 s2, s2, s7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_i32 s8, 0, s3
+; GFX9-NEXT:    s_lshl_b32 s2, 0x1000, s6
+; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s7
+; GFX9-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_add_i32 s4, s4, s6
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v0
-; GFX9-NEXT:    s_sub_i32 s8, 0, s2
 ; GFX9-NEXT:    s_xor_b32 s4, s4, s6
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
-; GFX9-NEXT:    s_add_i32 s5, s5, s7
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX9-NEXT:    s_xor_b32 s5, s5, s7
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v0
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
-; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v1
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX9-NEXT:    s_mul_i32 s7, s7, s8
+; GFX9-NEXT:    s_mul_hi_u32 s7, s8, s7
+; GFX9-NEXT:    s_add_i32 s8, s8, s7
+; GFX9-NEXT:    s_mul_hi_u32 s7, s4, s8
+; GFX9-NEXT:    s_mul_i32 s7, s7, s2
+; GFX9-NEXT:    s_sub_i32 s4, s4, s7
+; GFX9-NEXT:    s_sub_i32 s7, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX9-NEXT:    s_sub_i32 s7, s4, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX9-NEXT:    s_cselect_b32 s2, s7, s4
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s6
+; GFX9-NEXT:    s_sub_i32 s2, s2, s6
+; GFX9-NEXT:    s_sub_i32 s6, 0, s3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX9-NEXT:    s_ashr_i32 s4, s5, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s4
+; GFX9-NEXT:    s_xor_b32 s5, s5, s4
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX9-NEXT:    s_add_i32 s7, s7, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX9-NEXT:    s_mul_i32 s6, s6, s3
+; GFX9-NEXT:    s_sub_i32 s5, s5, s6
+; GFX9-NEXT:    s_sub_i32 s6, s5, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX9-NEXT:    s_sub_i32 s6, s5, s3
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX9-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX9-NEXT:    s_xor_b32 s3, s3, s4
+; GFX9-NEXT:    s_sub_i32 s3, s3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -7046,8 +7072,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7070,7 +7096,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b32 s4, s0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -7151,116 +7177,130 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX9-NEXT:    s_mov_b32 s3, 0x68958c89
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GFX9-NEXT:    s_movk_i32 s2, 0x11f
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0xfffffee0
+; GFX9-NEXT:    s_mul_hi_u32 s2, s0, 0x68958c89
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    s_mul_i32 s3, s2, 0x68958c89
+; GFX9-NEXT:    s_add_i32 s1, s1, s3
+; GFX9-NEXT:    s_mul_i32 s9, s0, 0x68958c89
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT:    s_mul_i32 s8, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s9
+; GFX9-NEXT:    s_add_u32 s0, s0, s8
+; GFX9-NEXT:    s_addc_u32 s3, 0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT:    s_mul_i32 s9, s2, s9
+; GFX9-NEXT:    s_add_u32 s0, s0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s1
+; GFX9-NEXT:    s_addc_u32 s0, s3, s10
+; GFX9-NEXT:    s_addc_u32 s3, s8, 0
+; GFX9-NEXT:    s_mul_i32 s1, s2, s1
+; GFX9-NEXT:    s_add_u32 s0, s0, s1
+; GFX9-NEXT:    s_addc_u32 s1, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s0, s2, s1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    s_mul_i32 s3, s2, 0xfffffee0
+; GFX9-NEXT:    s_mul_hi_u32 s8, s2, 0x68958c89
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0x68958c89
+; GFX9-NEXT:    s_add_i32 s3, s8, s3
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_mul_i32 s9, s2, 0x68958c89
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX9-NEXT:    s_mul_i32 s8, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s9
+; GFX9-NEXT:    s_add_u32 s2, s2, s8
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
+; GFX9-NEXT:    s_mul_i32 s9, s0, s9
+; GFX9-NEXT:    s_add_u32 s2, s2, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s3
+; GFX9-NEXT:    s_addc_u32 s1, s1, s10
+; GFX9-NEXT:    s_addc_u32 s2, s8, 0
+; GFX9-NEXT:    s_mul_i32 s3, s0, s3
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_addc_u32 s2, 0, s2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s0, s0, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x11f
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s3, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    s_movk_i32 s3, 0x11e
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
-; GFX9-NEXT:    s_mov_b32 s6, 0x976a7376
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX9-NEXT:    s_mul_i32 s2, s6, s0
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s0
+; GFX9-NEXT:    s_add_u32 s2, s8, s2
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s9, s7, s3
+; GFX9-NEXT:    s_mul_i32 s3, s7, s3
+; GFX9-NEXT:    s_add_u32 s2, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s8, s7, s0
+; GFX9-NEXT:    s_addc_u32 s1, s1, s9
+; GFX9-NEXT:    s_addc_u32 s2, s8, 0
+; GFX9-NEXT:    s_mul_i32 s0, s7, s0
+; GFX9-NEXT:    s_add_u32 s3, s1, s0
+; GFX9-NEXT:    s_addc_u32 s2, 0, s2
+; GFX9-NEXT:    s_mul_i32 s0, s3, 0x11f
+; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x976a7377
+; GFX9-NEXT:    s_add_i32 s0, s8, s0
+; GFX9-NEXT:    s_mul_i32 s8, s2, 0x976a7377
+; GFX9-NEXT:    s_mul_i32 s9, s3, 0x976a7377
+; GFX9-NEXT:    s_add_i32 s8, s0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    s_sub_i32 s0, s7, s8
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
+; GFX9-NEXT:    s_mov_b32 s1, 0x976a7377
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s0, 0x11f
+; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s1, v0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s6, s6, 0
+; GFX9-NEXT:    s_cmpk_gt_u32 s6, 0x11e
+; GFX9-NEXT:    s_mov_b32 s10, 0x976a7376
+; GFX9-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s10, v1
+; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x11f
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s9
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_add_u32 s6, s3, 2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[0:1]
+; GFX9-NEXT:    s_addc_u32 s0, s2, 0
+; GFX9-NEXT:    s_add_u32 s9, s3, 1
+; GFX9-NEXT:    s_addc_u32 s1, s2, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s7, s7, s8
+; GFX9-NEXT:    s_cmpk_gt_u32 s7, 0x11e
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
+; GFX9-NEXT:    s_cmpk_eq_i32 s7, 0x11f
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s8
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v0, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
   store i64 %r, i64 addrspace(1)* %out
@@ -7421,8 +7461,8 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7443,8 +7483,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
 ; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7702,8 +7742,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
 ; GFX6-NEXT:    s_mov_b32 s12, 0x9761f7c9
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -7727,7 +7767,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX6-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s10, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -7764,8 +7804,8 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x11f
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
@@ -7805,115 +7845,128 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX9-NEXT:    s_mov_b32 s3, 0x689e0837
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s12, 0x9761f7c8
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0x11f
-; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
-; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0xfffffee0
+; GFX9-NEXT:    s_mul_hi_u32 s2, s0, 0x689e0837
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    s_mul_i32 s3, s2, 0x689e0837
+; GFX9-NEXT:    s_add_i32 s1, s1, s3
+; GFX9-NEXT:    s_mul_i32 s9, s0, 0x689e0837
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT:    s_mul_i32 s8, s0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s9
+; GFX9-NEXT:    s_add_u32 s0, s0, s8
+; GFX9-NEXT:    s_addc_u32 s3, 0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s10, s2, s9
+; GFX9-NEXT:    s_mul_i32 s9, s2, s9
+; GFX9-NEXT:    s_add_u32 s0, s0, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s1
+; GFX9-NEXT:    s_addc_u32 s0, s3, s10
+; GFX9-NEXT:    s_addc_u32 s3, s8, 0
+; GFX9-NEXT:    s_mul_i32 s1, s2, s1
+; GFX9-NEXT:    s_add_u32 s0, s0, s1
+; GFX9-NEXT:    s_addc_u32 s1, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s0, s2, s1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    s_mul_i32 s3, s2, 0xfffffee0
+; GFX9-NEXT:    s_mul_hi_u32 s8, s2, 0x689e0837
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0x689e0837
+; GFX9-NEXT:    s_add_i32 s3, s8, s3
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_mul_i32 s9, s2, 0x689e0837
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX9-NEXT:    s_mul_i32 s8, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s9
+; GFX9-NEXT:    s_add_u32 s2, s2, s8
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
+; GFX9-NEXT:    s_mul_i32 s9, s0, s9
+; GFX9-NEXT:    s_add_u32 s2, s2, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s3
+; GFX9-NEXT:    s_addc_u32 s1, s1, s10
+; GFX9-NEXT:    s_addc_u32 s2, s8, 0
+; GFX9-NEXT:    s_mul_i32 s3, s0, s3
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_addc_u32 s2, 0, s2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s0, s0, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x11f
+; GFX9-NEXT:    s_mul_i32 s2, s6, s0
+; GFX9-NEXT:    s_mul_hi_u32 s8, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s1, s6, s0
+; GFX9-NEXT:    s_add_u32 s2, s8, s2
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_hi_u32 s9, s7, s3
+; GFX9-NEXT:    s_mul_i32 s3, s7, s3
+; GFX9-NEXT:    s_add_u32 s2, s2, s3
+; GFX9-NEXT:    s_mul_hi_u32 s8, s7, s0
+; GFX9-NEXT:    s_addc_u32 s1, s1, s9
+; GFX9-NEXT:    s_addc_u32 s2, s8, 0
+; GFX9-NEXT:    s_mul_i32 s0, s7, s0
+; GFX9-NEXT:    s_add_u32 s0, s1, s0
+; GFX9-NEXT:    s_addc_u32 s1, 0, s2
+; GFX9-NEXT:    s_mul_i32 s2, s0, 0x11f
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, 0x9761f7c9
+; GFX9-NEXT:    s_add_i32 s2, s3, s2
+; GFX9-NEXT:    s_mul_i32 s1, s1, 0x9761f7c9
+; GFX9-NEXT:    s_mul_i32 s0, s0, 0x9761f7c9
+; GFX9-NEXT:    s_add_i32 s9, s2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_sub_i32 s1, s7, s9
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s9, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
-; GFX9-NEXT:    s_movk_i32 s6, 0x11e
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s6, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s10, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s8, v6
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    s_mov_b32 s8, 0x9761f7c9
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s1, 0x11f
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s10, s6, 0
+; GFX9-NEXT:    s_cmpk_gt_u32 s10, 0x11e
+; GFX9-NEXT:    s_cselect_b32 s11, -1, 0
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[2:3], s12, v3
+; GFX9-NEXT:    s_cmpk_eq_i32 s10, 0x11f
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v4, s11
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
+; GFX9-NEXT:    s_subb_u32 s2, s6, 0x11f
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v3
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s0, s2, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s2, s7, s9
+; GFX9-NEXT:    s_cmpk_gt_u32 s2, 0x11e
+; GFX9-NEXT:    v_mov_b32_e32 v5, s10
+; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s12, v0
+; GFX9-NEXT:    s_cmpk_eq_i32 s2, 0x11f
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
   store i64 %r, i64 addrspace(1)* %out
@@ -8138,8 +8191,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s9, s8
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s8
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -8162,7 +8215,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
 ; GFX6-NEXT:    s_mov_b32 s6, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -8434,8 +8487,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s12
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -8495,9 +8548,9 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -8549,125 +8602,143 @@
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_sub_u32 s10, 0, s8
-; GFX9-NEXT:    s_subb_u32 s4, 0, s9
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s11, v1
+; GFX9-NEXT:    s_mul_i32 s12, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s14, s0, s11
+; GFX9-NEXT:    s_mul_i32 s13, s1, s11
+; GFX9-NEXT:    s_add_i32 s12, s14, s12
+; GFX9-NEXT:    s_add_i32 s12, s12, s13
+; GFX9-NEXT:    s_mul_i32 s15, s0, s11
+; GFX9-NEXT:    s_mul_hi_u32 s13, s11, s12
+; GFX9-NEXT:    s_mul_i32 s14, s11, s12
+; GFX9-NEXT:    s_mul_hi_u32 s11, s11, s15
+; GFX9-NEXT:    s_add_u32 s11, s11, s14
+; GFX9-NEXT:    s_addc_u32 s13, 0, s13
+; GFX9-NEXT:    s_mul_hi_u32 s16, s10, s15
+; GFX9-NEXT:    s_mul_i32 s15, s10, s15
+; GFX9-NEXT:    s_add_u32 s11, s11, s15
+; GFX9-NEXT:    s_mul_hi_u32 s14, s10, s12
+; GFX9-NEXT:    s_addc_u32 s11, s13, s16
+; GFX9-NEXT:    s_addc_u32 s13, s14, 0
+; GFX9-NEXT:    s_mul_i32 s12, s10, s12
+; GFX9-NEXT:    s_add_u32 s11, s11, s12
+; GFX9-NEXT:    s_addc_u32 s12, 0, s13
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s11, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s10, s10, s12
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX9-NEXT:    s_mul_i32 s11, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
+; GFX9-NEXT:    s_add_i32 s11, s13, s11
+; GFX9-NEXT:    s_mul_i32 s1, s1, s12
+; GFX9-NEXT:    s_add_i32 s11, s11, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s12
+; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s0
+; GFX9-NEXT:    s_mul_i32 s14, s10, s0
+; GFX9-NEXT:    s_mul_i32 s16, s12, s11
+; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
+; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s11
+; GFX9-NEXT:    s_add_u32 s0, s0, s16
+; GFX9-NEXT:    s_addc_u32 s12, 0, s15
+; GFX9-NEXT:    s_add_u32 s0, s0, s14
+; GFX9-NEXT:    s_mul_hi_u32 s1, s10, s11
+; GFX9-NEXT:    s_addc_u32 s0, s12, s13
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mul_i32 s11, s10, s11
+; GFX9-NEXT:    s_add_u32 s0, s0, s11
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s12, s10, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s11, s10
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v0
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX9-NEXT:    s_mul_i32 s1, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s13
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s12
+; GFX9-NEXT:    s_add_u32 s1, s14, s1
+; GFX9-NEXT:    s_addc_u32 s0, 0, s0
+; GFX9-NEXT:    s_mul_hi_u32 s15, s7, s13
+; GFX9-NEXT:    s_mul_i32 s13, s7, s13
+; GFX9-NEXT:    s_add_u32 s1, s1, s13
+; GFX9-NEXT:    s_mul_hi_u32 s14, s7, s12
+; GFX9-NEXT:    s_addc_u32 s0, s0, s15
+; GFX9-NEXT:    s_addc_u32 s1, s14, 0
+; GFX9-NEXT:    s_mul_i32 s12, s7, s12
+; GFX9-NEXT:    s_add_u32 s12, s0, s12
+; GFX9-NEXT:    s_addc_u32 s13, 0, s1
+; GFX9-NEXT:    s_mul_i32 s0, s8, s13
+; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s12
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_mul_i32 s1, s9, s12
+; GFX9-NEXT:    s_add_i32 s14, s0, s1
+; GFX9-NEXT:    s_mul_i32 s1, s8, s12
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_sub_i32 s0, s7, s14
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s0, s9
+; GFX9-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s8, v1
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s6, s6, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
+; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s6, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_add_u32 s6, s12, 2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX9-NEXT:    s_addc_u32 s0, s13, 0
+; GFX9-NEXT:    s_add_u32 s15, s12, 1
+; GFX9-NEXT:    s_addc_u32 s1, s13, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s7, s7, s14
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s9
+; GFX9-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s7, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s14
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s13
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s15
+; GFX9-NEXT:    v_mov_b32_e32 v3, s6
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s12
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v4, s1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v4, vcc
+; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
@@ -8772,9 +8843,9 @@
 ; GFX6-NEXT:    s_add_u32 s0, s0, s8
 ; GFX6-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX6-NEXT:    s_ashr_i64 s[8:9], s[0:1], 12
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -8798,7 +8869,7 @@
 ; GFX6-NEXT:    s_mov_b32 s11, s10
 ; GFX6-NEXT:    s_addc_u32 s1, s3, s10
 ; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -9046,8 +9117,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v0
 ; GFX6-NEXT:    s_xor_b64 s[14:15], s[16:17], s[14:15]
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -9106,9 +9177,9 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s13
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -9159,9 +9230,9 @@
 ; GFX6-NEXT:    s_subb_u32 s1, 0, s3
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
 ; GFX6-NEXT:    s_ashr_i32 s12, s7, 31
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
@@ -9273,257 +9344,293 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
-; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s12
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    s_addc_u32 s3, s3, s12
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_sub_u32 s2, 0, s8
-; GFX9-NEXT:    s_subb_u32 s3, 0, s9
-; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s15, s14
+; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s8
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    s_addc_u32 s3, s3, s8
+; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_u32 s0, 0, s12
+; GFX9-NEXT:    s_subb_u32 s1, 0, s13
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
+; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
-; GFX9-NEXT:    s_add_u32 s2, s4, s14
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_addc_u32 s3, s5, s14
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s4, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s5, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX9-NEXT:    v_mov_b32_e32 v6, s9
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v1
-; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v1
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v1
-; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v3
-; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s4, v4
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v4
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], 2, v1
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 1, v1
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v2, s[0:1]
+; GFX9-NEXT:    v_readfirstlane_b32 s14, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s15, v0
+; GFX9-NEXT:    s_mul_i32 s16, s0, s14
+; GFX9-NEXT:    s_mul_hi_u32 s18, s0, s15
+; GFX9-NEXT:    s_mul_i32 s17, s1, s15
+; GFX9-NEXT:    s_add_i32 s16, s18, s16
+; GFX9-NEXT:    s_add_i32 s16, s16, s17
+; GFX9-NEXT:    s_mul_i32 s19, s0, s15
+; GFX9-NEXT:    s_mul_hi_u32 s17, s15, s16
+; GFX9-NEXT:    s_mul_i32 s18, s15, s16
+; GFX9-NEXT:    s_mul_hi_u32 s15, s15, s19
+; GFX9-NEXT:    s_add_u32 s15, s15, s18
+; GFX9-NEXT:    s_addc_u32 s17, 0, s17
+; GFX9-NEXT:    s_mul_hi_u32 s20, s14, s19
+; GFX9-NEXT:    s_mul_i32 s19, s14, s19
+; GFX9-NEXT:    s_add_u32 s15, s15, s19
+; GFX9-NEXT:    s_mul_hi_u32 s18, s14, s16
+; GFX9-NEXT:    s_addc_u32 s15, s17, s20
+; GFX9-NEXT:    s_addc_u32 s17, s18, 0
+; GFX9-NEXT:    s_mul_i32 s16, s14, s16
+; GFX9-NEXT:    s_add_u32 s15, s15, s16
+; GFX9-NEXT:    s_addc_u32 s16, 0, s17
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s15, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s14, s14, s16
+; GFX9-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX9-NEXT:    s_mul_i32 s15, s0, s14
+; GFX9-NEXT:    s_mul_hi_u32 s17, s0, s16
+; GFX9-NEXT:    s_add_i32 s15, s17, s15
+; GFX9-NEXT:    s_mul_i32 s1, s1, s16
+; GFX9-NEXT:    s_add_i32 s15, s15, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s16
+; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s0
+; GFX9-NEXT:    s_mul_i32 s18, s14, s0
+; GFX9-NEXT:    s_mul_i32 s20, s16, s15
+; GFX9-NEXT:    s_mul_hi_u32 s0, s16, s0
+; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s15
+; GFX9-NEXT:    s_add_u32 s0, s0, s20
+; GFX9-NEXT:    s_addc_u32 s16, 0, s19
+; GFX9-NEXT:    s_add_u32 s0, s0, s18
+; GFX9-NEXT:    s_mul_hi_u32 s1, s14, s15
+; GFX9-NEXT:    s_addc_u32 s0, s16, s17
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mul_i32 s15, s14, s15
+; GFX9-NEXT:    s_add_u32 s0, s0, s15
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s16, s14, s1
+; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX9-NEXT:    s_add_u32 s0, s4, s14
+; GFX9-NEXT:    s_mov_b32 s15, s14
+; GFX9-NEXT:    s_addc_u32 s1, s5, s14
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
+; GFX9-NEXT:    v_readfirstlane_b32 s17, v0
+; GFX9-NEXT:    s_mul_i32 s1, s4, s16
+; GFX9-NEXT:    s_mul_hi_u32 s18, s4, s17
+; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s16
+; GFX9-NEXT:    s_add_u32 s1, s18, s1
+; GFX9-NEXT:    s_addc_u32 s0, 0, s0
+; GFX9-NEXT:    s_mul_hi_u32 s19, s5, s17
+; GFX9-NEXT:    s_mul_i32 s17, s5, s17
+; GFX9-NEXT:    s_add_u32 s1, s1, s17
+; GFX9-NEXT:    s_mul_hi_u32 s18, s5, s16
+; GFX9-NEXT:    s_addc_u32 s0, s0, s19
+; GFX9-NEXT:    s_addc_u32 s1, s18, 0
+; GFX9-NEXT:    s_mul_i32 s16, s5, s16
+; GFX9-NEXT:    s_add_u32 s16, s0, s16
+; GFX9-NEXT:    s_addc_u32 s17, 0, s1
+; GFX9-NEXT:    s_mul_i32 s0, s12, s17
+; GFX9-NEXT:    s_mul_hi_u32 s1, s12, s16
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_mul_i32 s1, s13, s16
+; GFX9-NEXT:    s_add_i32 s18, s0, s1
+; GFX9-NEXT:    s_mul_i32 s1, s12, s16
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    s_sub_i32 s0, s5, s18
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s4, s0, s13
+; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s4, s4, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX9-NEXT:    s_cselect_b32 s19, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s4, s13
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v2, s19
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_add_u32 s4, s16, 2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
+; GFX9-NEXT:    s_addc_u32 s0, s17, 0
+; GFX9-NEXT:    s_add_u32 s19, s16, 1
+; GFX9-NEXT:    s_addc_u32 s1, s17, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s5, s5, s18
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s13
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_cselect_b32 s18, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s5, s13
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s18
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, s17
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s19
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
 ; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX9-NEXT:    s_add_u32 s10, s10, s4
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v7, s5
+; GFX9-NEXT:    s_add_u32 s8, s10, s4
 ; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s11, s11, s4
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[4:5]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s11
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GFX9-NEXT:    v_mac_f32_e32 v9, 0x4f800000, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v9
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    s_sub_u32 s0, 0, s10
-; GFX9-NEXT:    s_subb_u32 s1, 0, s11
-; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v4
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
-; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
-; GFX9-NEXT:    s_ashr_i32 s8, s7, 31
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, s1, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v3
-; GFX9-NEXT:    s_add_u32 s0, s6, s8
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v8
-; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v11, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    s_addc_u32 s1, s7, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, s7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s7, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, s12, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s13, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s10, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
-; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v1
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
-; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v5
-; GFX9-NEXT:    v_mov_b32_e32 v8, s11
-; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v6
-; GFX9-NEXT:    v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v8, s[0:1], s10, v6
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], 2, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], 1, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v9, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v9, v5, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[8:9], s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
-; GFX9-NEXT:    v_xor_b32_e32 v4, s1, v4
+; GFX9-NEXT:    s_addc_u32 s9, s11, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s16
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v1
+; GFX9-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; GFX9-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s13, v3
+; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s10
+; GFX9-NEXT:    s_mul_i32 s14, s0, s13
+; GFX9-NEXT:    s_mul_i32 s11, s1, s10
+; GFX9-NEXT:    s_add_i32 s12, s12, s14
+; GFX9-NEXT:    s_add_i32 s12, s12, s11
+; GFX9-NEXT:    s_mul_i32 s15, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s11, s10, s12
+; GFX9-NEXT:    s_mul_i32 s14, s10, s12
+; GFX9-NEXT:    s_mul_hi_u32 s10, s10, s15
+; GFX9-NEXT:    s_add_u32 s10, s10, s14
+; GFX9-NEXT:    s_addc_u32 s11, 0, s11
+; GFX9-NEXT:    s_mul_hi_u32 s16, s13, s15
+; GFX9-NEXT:    s_mul_i32 s15, s13, s15
+; GFX9-NEXT:    s_add_u32 s10, s10, s15
+; GFX9-NEXT:    s_mul_hi_u32 s14, s13, s12
+; GFX9-NEXT:    s_addc_u32 s10, s11, s16
+; GFX9-NEXT:    s_addc_u32 s11, s14, 0
+; GFX9-NEXT:    s_mul_i32 s12, s13, s12
+; GFX9-NEXT:    s_add_u32 s10, s10, s12
+; GFX9-NEXT:    s_addc_u32 s11, 0, s11
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s10, v2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s10, s13, s11
+; GFX9-NEXT:    v_readfirstlane_b32 s12, v2
+; GFX9-NEXT:    s_mul_i32 s11, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s12
+; GFX9-NEXT:    s_add_i32 s11, s13, s11
+; GFX9-NEXT:    s_mul_i32 s1, s1, s12
+; GFX9-NEXT:    s_add_i32 s11, s11, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s12
+; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s0
+; GFX9-NEXT:    s_mul_i32 s14, s10, s0
+; GFX9-NEXT:    s_mul_i32 s16, s12, s11
+; GFX9-NEXT:    s_mul_hi_u32 s0, s12, s0
+; GFX9-NEXT:    s_mul_hi_u32 s15, s12, s11
+; GFX9-NEXT:    s_add_u32 s0, s0, s16
+; GFX9-NEXT:    s_addc_u32 s12, 0, s15
+; GFX9-NEXT:    s_add_u32 s0, s0, s14
+; GFX9-NEXT:    s_mul_hi_u32 s1, s10, s11
+; GFX9-NEXT:    s_addc_u32 s0, s12, s13
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mul_i32 s11, s10, s11
+; GFX9-NEXT:    s_add_u32 s0, s0, s11
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s12, s10, s1
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX9-NEXT:    s_add_u32 s0, s6, s10
+; GFX9-NEXT:    s_mov_b32 s11, s10
+; GFX9-NEXT:    s_addc_u32 s1, s7, s10
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
+; GFX9-NEXT:    v_readfirstlane_b32 s13, v2
+; GFX9-NEXT:    s_mul_i32 s1, s6, s12
+; GFX9-NEXT:    s_mul_hi_u32 s14, s6, s13
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s12
+; GFX9-NEXT:    s_add_u32 s1, s14, s1
+; GFX9-NEXT:    s_addc_u32 s0, 0, s0
+; GFX9-NEXT:    s_mul_hi_u32 s15, s7, s13
+; GFX9-NEXT:    s_mul_i32 s13, s7, s13
+; GFX9-NEXT:    s_add_u32 s1, s1, s13
+; GFX9-NEXT:    s_mul_hi_u32 s14, s7, s12
+; GFX9-NEXT:    s_addc_u32 s0, s0, s15
+; GFX9-NEXT:    s_addc_u32 s1, s14, 0
+; GFX9-NEXT:    s_mul_i32 s12, s7, s12
+; GFX9-NEXT:    s_add_u32 s12, s0, s12
+; GFX9-NEXT:    s_addc_u32 s13, 0, s1
+; GFX9-NEXT:    s_mul_i32 s0, s8, s13
+; GFX9-NEXT:    s_mul_hi_u32 s1, s8, s12
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_mul_i32 s1, s9, s12
+; GFX9-NEXT:    s_add_i32 s14, s0, s1
+; GFX9-NEXT:    s_mul_i32 s1, s8, s12
+; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    s_sub_i32 s0, s7, s14
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s0, s9
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v2
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s6, s6, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s9
+; GFX9-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
+; GFX9-NEXT:    s_cmp_eq_u32 s6, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s15
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_add_u32 s6, s12, 2
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
+; GFX9-NEXT:    s_addc_u32 s0, s13, 0
+; GFX9-NEXT:    s_add_u32 s15, s12, 1
+; GFX9-NEXT:    s_addc_u32 s1, s13, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s7, s7, s14
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s1
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s0, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT:    s_cselect_b32 s14, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s7, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s13
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v3, s15
+; GFX9-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX9-NEXT:    v_xor_b32_e32 v5, s1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[2:3]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -9559,8 +9666,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GFX6-NEXT:    s_mov_b32 s9, s8
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s8
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -9653,113 +9760,127 @@
 ;
 ; GFX9-LABEL: srem_i64_oddk_denom:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4996c7d8
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s2, 0xffed2705
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s2
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffed2705
+; GFX9-NEXT:    s_mul_i32 s3, s0, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s2, s2, s3
+; GFX9-NEXT:    s_sub_i32 s2, s2, s1
+; GFX9-NEXT:    s_mul_i32 s9, s1, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s3, s1, s2
+; GFX9-NEXT:    s_mul_i32 s8, s1, s2
+; GFX9-NEXT:    s_mul_hi_u32 s1, s1, s9
+; GFX9-NEXT:    s_add_u32 s1, s1, s8
+; GFX9-NEXT:    s_addc_u32 s3, 0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s9
+; GFX9-NEXT:    s_mul_i32 s9, s0, s9
+; GFX9-NEXT:    s_add_u32 s1, s1, s9
+; GFX9-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, s3, s10
+; GFX9-NEXT:    s_addc_u32 s3, s8, 0
+; GFX9-NEXT:    s_mul_i32 s2, s0, s2
+; GFX9-NEXT:    s_add_u32 s1, s1, s2
+; GFX9-NEXT:    s_addc_u32 s2, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s0, s0, s2
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX9-NEXT:    s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_sub_i32 s1, s3, s2
+; GFX9-NEXT:    s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT:    s_mul_i32 s12, s2, s1
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT:    s_add_u32 s2, s2, s12
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT:    s_mul_i32 s10, s0, s8
+; GFX9-NEXT:    s_addc_u32 s8, 0, s11
+; GFX9-NEXT:    s_add_u32 s2, s2, s10
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT:    s_addc_u32 s2, s8, s9
+; GFX9-NEXT:    s_addc_u32 s3, s3, 0
+; GFX9-NEXT:    s_mul_i32 s1, s0, s1
+; GFX9-NEXT:    s_add_u32 s1, s2, s1
+; GFX9-NEXT:    s_addc_u32 s2, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s1, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s8, s0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
 ; GFX9-NEXT:    s_add_u32 s0, s6, s2
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, s2
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s1, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s1
+; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX9-NEXT:    s_mul_i32 s6, s0, s8
+; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s7
+; GFX9-NEXT:    s_mul_hi_u32 s3, s0, s8
+; GFX9-NEXT:    s_add_u32 s6, s9, s6
+; GFX9-NEXT:    s_addc_u32 s3, 0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s10, s1, s7
+; GFX9-NEXT:    s_mul_i32 s7, s1, s7
+; GFX9-NEXT:    s_add_u32 s6, s6, s7
+; GFX9-NEXT:    s_mul_hi_u32 s9, s1, s8
+; GFX9-NEXT:    s_addc_u32 s3, s3, s10
+; GFX9-NEXT:    s_addc_u32 s6, s9, 0
+; GFX9-NEXT:    s_mul_i32 s7, s1, s8
+; GFX9-NEXT:    s_add_u32 s3, s3, s7
+; GFX9-NEXT:    s_addc_u32 s6, 0, s6
+; GFX9-NEXT:    s_mul_hi_u32 s8, s3, 0x12d8fb
+; GFX9-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
+; GFX9-NEXT:    s_mul_i32 s6, s6, 0x12d8fb
+; GFX9-NEXT:    v_mov_b32_e32 v0, s3
+; GFX9-NEXT:    s_add_i32 s8, s8, s6
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s3, v2
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc
-; GFX9-NEXT:    s_mov_b32 s0, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s7, 0x12d8fb
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s3, s1, s8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s7, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s0, s3, 0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s7, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s1, s0, 0
+; GFX9-NEXT:    s_mov_b32 s6, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v5, s0
+; GFX9-NEXT:    v_mov_b32_e32 v6, s1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cmp_lt_u32_e64 s[0:1], s6, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s3, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v4
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v3, v[0:1], s[4:5]
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
@@ -9855,8 +9976,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s10
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -9968,123 +10089,140 @@
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_sub_u32 s2, 0, s8
-; GFX9-NEXT:    s_subb_u32 s3, 0, s9
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX9-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX9-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX9-NEXT:    s_mul_i32 s10, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s12, s0, s3
+; GFX9-NEXT:    s_mul_i32 s11, s1, s3
+; GFX9-NEXT:    s_add_i32 s10, s12, s10
+; GFX9-NEXT:    s_add_i32 s10, s10, s11
+; GFX9-NEXT:    s_mul_i32 s13, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s11, s3, s10
+; GFX9-NEXT:    s_mul_i32 s12, s3, s10
+; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s13
+; GFX9-NEXT:    s_add_u32 s3, s3, s12
+; GFX9-NEXT:    s_addc_u32 s11, 0, s11
+; GFX9-NEXT:    s_mul_hi_u32 s14, s2, s13
+; GFX9-NEXT:    s_mul_i32 s13, s2, s13
+; GFX9-NEXT:    s_add_u32 s3, s3, s13
+; GFX9-NEXT:    s_mul_hi_u32 s12, s2, s10
+; GFX9-NEXT:    s_addc_u32 s3, s11, s14
+; GFX9-NEXT:    s_addc_u32 s11, s12, 0
+; GFX9-NEXT:    s_mul_i32 s10, s2, s10
+; GFX9-NEXT:    s_add_u32 s3, s3, s10
+; GFX9-NEXT:    s_addc_u32 s10, 0, s11
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s3, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s10
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX9-NEXT:    s_mul_i32 s3, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s10
+; GFX9-NEXT:    s_add_i32 s3, s11, s3
+; GFX9-NEXT:    s_mul_i32 s1, s1, s10
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s0
+; GFX9-NEXT:    s_mul_i32 s12, s2, s0
+; GFX9-NEXT:    s_mul_i32 s14, s10, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s0
+; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s14
+; GFX9-NEXT:    s_addc_u32 s10, 0, s13
+; GFX9-NEXT:    s_add_u32 s0, s0, s12
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX9-NEXT:    s_addc_u32 s0, s10, s11
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s3
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    s_mov_b32 s11, s10
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, s8, v1
-; GFX9-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s6, v0
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s8, v0
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v5
-; GFX9-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v6
-; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s8, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX9-NEXT:    s_mul_i32 s1, s6, s2
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s2
+; GFX9-NEXT:    s_add_u32 s1, s11, s1
+; GFX9-NEXT:    s_addc_u32 s0, 0, s0
+; GFX9-NEXT:    s_mul_hi_u32 s12, s7, s3
+; GFX9-NEXT:    s_mul_i32 s3, s7, s3
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s2
+; GFX9-NEXT:    s_addc_u32 s0, s0, s12
+; GFX9-NEXT:    s_addc_u32 s1, s11, 0
+; GFX9-NEXT:    s_mul_i32 s2, s7, s2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_i32 s1, s8, s1
+; GFX9-NEXT:    s_mul_hi_u32 s2, s8, s0
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_mul_i32 s2, s9, s0
+; GFX9-NEXT:    s_mul_i32 s0, s8, s0
+; GFX9-NEXT:    s_add_i32 s11, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_sub_i32 s1, s7, s11
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s1, s9
+; GFX9-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s8, v1
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s12, s6, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s12, s9
+; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s12, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v4, s13
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX9-NEXT:    s_subb_u32 s2, s6, s9
+; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s8, v2
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s0, s2, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s2, s7, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s9
+; GFX9-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s2, s9
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s10
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s10
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s10, v1
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
+; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[4:5]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
@@ -10212,8 +10350,8 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
 ; GFX6-NEXT:    s_mov_b32 s10, -1
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -10322,9 +10460,9 @@
 ; GFX6-NEXT:    s_subb_u32 s1, 0, s5
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s1, v3
 ; GFX6-NEXT:    s_ashr_i32 s14, s7, 31
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v3, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v3, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v2
@@ -10390,8 +10528,8 @@
 ; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v2
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
@@ -10434,6 +10572,7 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b64 s[10:11], s[2:3], s10
 ; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
@@ -10444,243 +10583,276 @@
 ; GFX9-NEXT:    s_xor_b64 s[12:13], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX9-NEXT:    s_sub_u32 s2, 0, s12
-; GFX9-NEXT:    s_subb_u32 s3, 0, s13
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX9-NEXT:    s_sub_u32 s0, 0, s12
+; GFX9-NEXT:    s_subb_u32 s1, 0, s13
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
-; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
-; GFX9-NEXT:    s_add_u32 s2, s4, s8
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
-; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v5
-; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_addc_u32 s3, s5, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
-; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v1
-; GFX9-NEXT:    v_mul_hi_u32 v5, s15, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s15, v1
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
-; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_lo_u32 v4, s13, v1
-; GFX9-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT:    v_sub_u32_e32 v3, s15, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, s13
-; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, s14, v1
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v1
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
-; GFX9-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX9-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
-; GFX9-NEXT:    s_add_u32 s10, s10, s2
-; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v5
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s11, s11, s2
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[2:3]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s11
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v6, s15
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v6, v2, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
-; GFX9-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s14, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s16, s0, s3
+; GFX9-NEXT:    s_mul_i32 s15, s1, s3
+; GFX9-NEXT:    s_add_i32 s14, s16, s14
+; GFX9-NEXT:    s_add_i32 s14, s14, s15
+; GFX9-NEXT:    s_mul_i32 s17, s0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s15, s3, s14
+; GFX9-NEXT:    s_mul_i32 s16, s3, s14
+; GFX9-NEXT:    s_mul_hi_u32 s3, s3, s17
+; GFX9-NEXT:    s_add_u32 s3, s3, s16
+; GFX9-NEXT:    s_addc_u32 s15, 0, s15
+; GFX9-NEXT:    s_mul_hi_u32 s18, s2, s17
+; GFX9-NEXT:    s_mul_i32 s17, s2, s17
+; GFX9-NEXT:    s_add_u32 s3, s3, s17
+; GFX9-NEXT:    s_mul_hi_u32 s16, s2, s14
+; GFX9-NEXT:    s_addc_u32 s3, s15, s18
+; GFX9-NEXT:    s_addc_u32 s15, s16, 0
+; GFX9-NEXT:    s_mul_i32 s14, s2, s14
+; GFX9-NEXT:    s_add_u32 s3, s3, s14
+; GFX9-NEXT:    s_addc_u32 s14, 0, s15
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s14
+; GFX9-NEXT:    v_readfirstlane_b32 s14, v0
+; GFX9-NEXT:    s_mul_i32 s3, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s15, s0, s14
+; GFX9-NEXT:    s_add_i32 s3, s15, s3
+; GFX9-NEXT:    s_mul_i32 s1, s1, s14
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s14
+; GFX9-NEXT:    s_mul_hi_u32 s15, s2, s0
+; GFX9-NEXT:    s_mul_i32 s16, s2, s0
+; GFX9-NEXT:    s_mul_i32 s18, s14, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s14, s0
+; GFX9-NEXT:    s_mul_hi_u32 s17, s14, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s18
+; GFX9-NEXT:    s_addc_u32 s14, 0, s17
+; GFX9-NEXT:    s_add_u32 s0, s0, s16
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX9-NEXT:    s_addc_u32 s0, s14, s15
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s3
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s1
+; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX9-NEXT:    s_add_u32 s0, s4, s14
+; GFX9-NEXT:    s_mov_b32 s15, s14
+; GFX9-NEXT:    s_addc_u32 s1, s5, s14
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX9-NEXT:    s_mul_i32 s1, s4, s2
+; GFX9-NEXT:    s_mul_hi_u32 s15, s4, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s4, s2
+; GFX9-NEXT:    s_add_u32 s1, s15, s1
+; GFX9-NEXT:    s_addc_u32 s0, 0, s0
+; GFX9-NEXT:    s_mul_hi_u32 s16, s5, s3
+; GFX9-NEXT:    s_mul_i32 s3, s5, s3
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_mul_hi_u32 s15, s5, s2
+; GFX9-NEXT:    s_addc_u32 s0, s0, s16
+; GFX9-NEXT:    s_addc_u32 s1, s15, 0
+; GFX9-NEXT:    s_mul_i32 s2, s5, s2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_i32 s1, s12, s1
+; GFX9-NEXT:    s_mul_hi_u32 s2, s12, s0
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_mul_i32 s2, s13, s0
+; GFX9-NEXT:    s_mul_i32 s0, s12, s0
+; GFX9-NEXT:    s_add_i32 s15, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    s_sub_i32 s1, s5, s15
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s4, s1, s13
+; GFX9-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s16, s4, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s16, s13
+; GFX9-NEXT:    s_cselect_b32 s17, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v1
+; GFX9-NEXT:    s_cmp_eq_u32 s16, s13
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v3, s17
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX9-NEXT:    s_subb_u32 s2, s4, s13
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v1
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s0, s2, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s2, s5, s15
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s13
+; GFX9-NEXT:    v_mov_b32_e32 v5, s16
+; GFX9-NEXT:    v_mov_b32_e32 v6, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, s13
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s3
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX9-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v6, s2
+; GFX9-NEXT:    s_add_u32 s2, s10, s0
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_addc_u32 s3, s11, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, s14, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, s14, v2
+; GFX9-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v3
+; GFX9-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX9-NEXT:    v_mov_b32_e32 v5, s14
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v3
+; GFX9-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX9-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX9-NEXT:    s_sub_u32 s0, 0, s4
+; GFX9-NEXT:    s_subb_u32 s1, 0, s5
+; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s11, v3
+; GFX9-NEXT:    s_mul_hi_u32 s10, s0, s2
+; GFX9-NEXT:    s_mul_i32 s12, s0, s11
+; GFX9-NEXT:    s_mul_i32 s3, s1, s2
+; GFX9-NEXT:    s_add_i32 s10, s10, s12
+; GFX9-NEXT:    s_add_i32 s10, s10, s3
+; GFX9-NEXT:    s_mul_i32 s13, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s3, s2, s10
+; GFX9-NEXT:    s_mul_i32 s12, s2, s10
+; GFX9-NEXT:    s_mul_hi_u32 s2, s2, s13
+; GFX9-NEXT:    s_add_u32 s2, s2, s12
+; GFX9-NEXT:    s_addc_u32 s3, 0, s3
+; GFX9-NEXT:    s_mul_hi_u32 s14, s11, s13
+; GFX9-NEXT:    s_mul_i32 s13, s11, s13
+; GFX9-NEXT:    s_add_u32 s2, s2, s13
+; GFX9-NEXT:    s_mul_hi_u32 s12, s11, s10
+; GFX9-NEXT:    s_addc_u32 s2, s3, s14
+; GFX9-NEXT:    s_addc_u32 s3, s12, 0
+; GFX9-NEXT:    s_mul_i32 s10, s11, s10
+; GFX9-NEXT:    s_add_u32 s2, s2, s10
+; GFX9-NEXT:    s_addc_u32 s3, 0, s3
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s2, s11, s3
+; GFX9-NEXT:    v_readfirstlane_b32 s10, v2
+; GFX9-NEXT:    s_mul_i32 s3, s0, s2
+; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s10
+; GFX9-NEXT:    s_add_i32 s3, s11, s3
+; GFX9-NEXT:    s_mul_i32 s1, s1, s10
+; GFX9-NEXT:    s_add_i32 s3, s3, s1
+; GFX9-NEXT:    s_mul_i32 s0, s0, s10
+; GFX9-NEXT:    s_mul_hi_u32 s11, s2, s0
+; GFX9-NEXT:    s_mul_i32 s12, s2, s0
+; GFX9-NEXT:    s_mul_i32 s14, s10, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s10, s0
+; GFX9-NEXT:    s_mul_hi_u32 s13, s10, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s14
+; GFX9-NEXT:    s_addc_u32 s10, 0, s13
+; GFX9-NEXT:    s_add_u32 s0, s0, s12
+; GFX9-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX9-NEXT:    s_addc_u32 s0, s10, s11
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
+; GFX9-NEXT:    s_mul_i32 s3, s2, s3
+; GFX9-NEXT:    s_add_u32 s0, s0, s3
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_addc_u32 s2, s2, s1
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX9-NEXT:    s_add_u32 s0, s6, s10
+; GFX9-NEXT:    s_mov_b32 s11, s10
+; GFX9-NEXT:    s_addc_u32 s1, s7, s10
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
+; GFX9-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX9-NEXT:    s_mul_i32 s1, s6, s2
+; GFX9-NEXT:    s_mul_hi_u32 s11, s6, s3
+; GFX9-NEXT:    s_mul_hi_u32 s0, s6, s2
+; GFX9-NEXT:    s_add_u32 s1, s11, s1
+; GFX9-NEXT:    s_addc_u32 s0, 0, s0
+; GFX9-NEXT:    s_mul_hi_u32 s12, s7, s3
+; GFX9-NEXT:    s_mul_i32 s3, s7, s3
+; GFX9-NEXT:    s_add_u32 s1, s1, s3
+; GFX9-NEXT:    s_mul_hi_u32 s11, s7, s2
+; GFX9-NEXT:    s_addc_u32 s0, s0, s12
+; GFX9-NEXT:    s_addc_u32 s1, s11, 0
+; GFX9-NEXT:    s_mul_i32 s2, s7, s2
+; GFX9-NEXT:    s_add_u32 s0, s0, s2
+; GFX9-NEXT:    s_addc_u32 s1, 0, s1
+; GFX9-NEXT:    s_mul_i32 s1, s4, s1
+; GFX9-NEXT:    s_mul_hi_u32 s2, s4, s0
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_mul_i32 s2, s5, s0
+; GFX9-NEXT:    s_mul_i32 s0, s4, s0
+; GFX9-NEXT:    s_add_i32 s11, s1, s2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_sub_i32 s1, s7, s11
+; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s6, s1, s5
+; GFX9-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s4, v2
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s12, s6, 0
+; GFX9-NEXT:    s_cmp_ge_u32 s12, s5
+; GFX9-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v3
+; GFX9-NEXT:    s_cmp_eq_u32 s12, s5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v6, s13
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[2:3]
+; GFX9-NEXT:    s_subb_u32 s2, s6, s5
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v3
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_subb_u32 s0, s2, 0
+; GFX9-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX9-NEXT:    s_subb_u32 s2, s7, s11
+; GFX9-NEXT:    s_cmp_ge_u32 s2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v7, s12
+; GFX9-NEXT:    v_mov_b32_e32 v8, s0
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX9-NEXT:    s_cmp_eq_u32 s2, s5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v8, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, s3
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[0:1]
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v7
-; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX9-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    s_sub_u32 s0, 0, s10
-; GFX9-NEXT:    s_subb_u32 s1, 0, s11
-; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, s0, v5
-; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v4
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
-; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v3
-; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
-; GFX9-NEXT:    s_mov_b32 s13, s12
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v6, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s0, v3
-; GFX9-NEXT:    v_mul_lo_u32 v7, s1, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v3
-; GFX9-NEXT:    s_add_u32 s0, s6, s12
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v8
-; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v11, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v7, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    s_addc_u32 s1, s7, s12
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v6, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v4
-; GFX9-NEXT:    v_mul_hi_u32 v6, s6, v3
-; GFX9-NEXT:    v_mul_hi_u32 v8, s6, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, s7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s7, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX9-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX9-NEXT:    v_xor_b32_e32 v2, s8, v2
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v9, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v4
-; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v3
-; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v3
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, s8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s8, v1
-; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v7, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
-; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v4
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
-; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s10, v3
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s11, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v7
-; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s11, v8
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[2:3]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
-; GFX9-NEXT:    v_mov_b32_e32 v8, s7
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v8, v4, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
-; GFX9-NEXT:    v_xor_b32_e32 v4, s12, v4
-; GFX9-NEXT:    v_mov_b32_e32 v5, s12
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s12, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, s10, v5
+; GFX9-NEXT:    v_mov_b32_e32 v5, s10
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX9-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -1,14 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s | FileCheck -check-prefix=GCN_DBG %s
 
-; GCN-LABEL: {{^}}test_loop:
-; GCN: s_and_b64 s[0:1], exec, -1
-; GCN: [[LABEL:.LBB[0-9]+_[0-9]+]]: ; %for.body{{$}}
-; GCN: ds_read_b32
-; GCN: ds_write_b32
-; GCN: s_cbranch_vccnz [[LABEL]]
-; GCN: s_endpgm
 define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+; GCN-LABEL: test_loop:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xa
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_cmp_eq_u32 s2, -1
+; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
+; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_addk_i32 s0, 0x80
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_and_b64 s[0:1], exec, -1
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:  .LBB0_2: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    ds_read_b32 v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    ds_write_b32 v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_cbranch_vccnz .LBB0_2
+; GCN-NEXT:  .LBB0_3: ; %for.exit
+; GCN-NEXT:    s_endpgm
+;
+; GCN_DBG-LABEL: test_loop:
+; GCN_DBG:       ; %bb.0: ; %entry
+; GCN_DBG-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s2, 0
+; GCN_DBG-NEXT:    s_load_dword s1, s[0:1], 0xa
+; GCN_DBG-NEXT:    s_mov_b32 s0, 0
+; GCN_DBG-NEXT:    s_mov_b32 s2, -1
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    s_cmp_lg_u32 s1, s2
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB0_2
+; GCN_DBG-NEXT:  ; %bb.1: ; %for.exit
+; GCN_DBG-NEXT:    s_endpgm
+; GCN_DBG-NEXT:  .LBB0_2: ; %for.body
+; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
+; GCN_DBG-NEXT:    s_mov_b32 s1, 2
+; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_read_b32 v0, v0
+; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_write_b32 v0, v1
+; GCN_DBG-NEXT:    s_mov_b32 s1, 1
+; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
+; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
+; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB0_2
+; GCN_DBG-NEXT:  ; %bb.3: ; %DummyReturnBlock
+; GCN_DBG-NEXT:    s_endpgm
 entry:
   %cmp = icmp eq i32 %n, -1
   br i1 %cmp, label %for.exit, label %for.body
@@ -27,12 +86,58 @@
   br label %for.body
 }
 
-; GCN-LABEL: @loop_const_true
-; GCN: [[LABEL:.LBB[0-9]+_[0-9]+]]:
-; GCN: ds_read_b32
-; GCN: ds_write_b32
-; GCN: s_branch [[LABEL]]
 define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+; GCN-LABEL: loop_const_true:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_addk_i32 s0, 0x80
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:  .LBB1_1: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    ds_read_b32 v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    ds_write_b32 v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NEXT:    s_branch .LBB1_1
+;
+; GCN_DBG-LABEL: loop_const_true:
+; GCN_DBG:       ; %bb.0: ; %entry
+; GCN_DBG-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
+; GCN_DBG-NEXT:    s_mov_b32 s0, 0
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_branch .LBB1_2
+; GCN_DBG-NEXT:  .LBB1_1: ; %for.exit
+; GCN_DBG-NEXT:    s_endpgm
+; GCN_DBG-NEXT:  .LBB1_2: ; %for.body
+; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
+; GCN_DBG-NEXT:    s_mov_b32 s1, 2
+; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_read_b32 v0, v0
+; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_write_b32 v0, v1
+; GCN_DBG-NEXT:    s_mov_b32 s1, 1
+; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
+; GCN_DBG-NEXT:    s_mov_b64 s[2:3], 0
+; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB1_1
+; GCN_DBG-NEXT:    s_branch .LBB1_2
 entry:
   br label %for.body
 
@@ -50,10 +155,54 @@
   br i1 true, label %for.body, label %for.exit
 }
 
-; GCN-LABEL: {{^}}loop_const_false:
-; GCN-NOT: s_branch
-; GCN: s_endpgm
 define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+; GCN-LABEL: loop_const_false:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:    ds_read_b32 v1, v0 offset:128
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:128
+; GCN-NEXT:    s_endpgm
+;
+; GCN_DBG-LABEL: loop_const_false:
+; GCN_DBG:       ; %bb.0: ; %entry
+; GCN_DBG-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
+; GCN_DBG-NEXT:    s_mov_b32 s0, 0
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_branch .LBB2_2
+; GCN_DBG-NEXT:  .LBB2_1: ; %for.exit
+; GCN_DBG-NEXT:    s_endpgm
+; GCN_DBG-NEXT:  .LBB2_2: ; %for.body
+; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
+; GCN_DBG-NEXT:    s_mov_b32 s1, 2
+; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_read_b32 v0, v0
+; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_write_b32 v0, v1
+; GCN_DBG-NEXT:    s_mov_b32 s1, 1
+; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
+; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
+; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB2_1
+; GCN_DBG-NEXT:    s_branch .LBB2_2
 entry:
   br label %for.body
 
@@ -72,10 +221,52 @@
   br i1 false, label %for.body, label %for.exit
 }
 
-; GCN-LABEL: {{^}}loop_const_undef:
-; GCN-NOT: s_branch
-; GCN: s_endpgm
 define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+; GCN-LABEL: loop_const_undef:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:    ds_read_b32 v1, v0 offset:128
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    ds_write_b32 v0, v1 offset:128
+; GCN-NEXT:    s_endpgm
+;
+; GCN_DBG-LABEL: loop_const_undef:
+; GCN_DBG:       ; %bb.0: ; %entry
+; GCN_DBG-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
+; GCN_DBG-NEXT:    s_mov_b32 s0, 0
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_branch .LBB3_2
+; GCN_DBG-NEXT:  .LBB3_1: ; %for.exit
+; GCN_DBG-NEXT:    s_endpgm
+; GCN_DBG-NEXT:  .LBB3_2: ; %for.body
+; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 0
+; GCN_DBG-NEXT:    s_mov_b32 s1, 2
+; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 s2, 0x80
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_read_b32 v0, v0
+; GCN_DBG-NEXT:    s_mov_b32 s2, 1.0
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s2
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_write_b32 v0, v1
+; GCN_DBG-NEXT:    s_mov_b32 s1, 1
+; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    s_cbranch_scc1 .LBB3_1
+; GCN_DBG-NEXT:    s_branch .LBB3_2
 entry:
   br label %for.body
 
@@ -94,18 +285,81 @@
   br i1 undef, label %for.body, label %for.exit
 }
 
-; GCN-LABEL: {{^}}loop_arg_0:
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; GCN: v_cmp_eq_u32{{[^,]*}}, 1,
-; GCN: s_add_i32 s2, s0, 0x80
-
-; GCN: [[LOOPBB:.LBB[0-9]+_[0-9]+]]
-; GCN: _add_i32_e32 v0, vcc, 4, v0
-
-; GCN: s_cbranch_{{vccz|vccnz}} [[LOOPBB]]
-; GCN-NEXT: ; %bb.2
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind {
+; GCN-LABEL: loop_arg_0:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    s_mov_b32 m0, -1
+; GCN-NEXT:    ds_read_u8 v0, v0
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    s_bitcmp1_b32 s0, 0
+; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT:    s_addk_i32 s2, 0x80
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GCN-NEXT:  .LBB4_1: ; %for.body
+; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT:    ds_read_b32 v1, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT:    ds_write_b32 v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, 4, v0
+; GCN-NEXT:    s_mov_b64 vcc, s[0:1]
+; GCN-NEXT:    s_cbranch_vccz .LBB4_1
+; GCN-NEXT:  ; %bb.2: ; %for.exit
+; GCN-NEXT:    s_endpgm
+;
+; GCN_DBG-LABEL: loop_arg_0:
+; GCN_DBG:       ; %bb.0: ; %entry
+; GCN_DBG-NEXT:    s_load_dword s0, s[0:1], 0x9
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 0
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, 0
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    ds_read_u8 v0, v0
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN_DBG-NEXT:    s_and_b32 s0, 1, s0
+; GCN_DBG-NEXT:    s_cmp_eq_u32 s0, 1
+; GCN_DBG-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GCN_DBG-NEXT:    s_mov_b64 s[2:3], -1
+; GCN_DBG-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s1, 2
+; GCN_DBG-NEXT:    s_mov_b32 s0, 0
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 3
+; GCN_DBG-NEXT:    s_branch .LBB4_2
+; GCN_DBG-NEXT:  .LBB4_1: ; %for.exit
+; GCN_DBG-NEXT:    s_endpgm
+; GCN_DBG-NEXT:  .LBB4_2: ; %for.body
+; GCN_DBG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GCN_DBG-NEXT:    v_readlane_b32 s0, v2, 3
+; GCN_DBG-NEXT:    v_readlane_b32 s2, v2, 1
+; GCN_DBG-NEXT:    v_readlane_b32 s3, v2, 2
+; GCN_DBG-NEXT:    v_readlane_b32 s4, v2, 0
+; GCN_DBG-NEXT:    s_mov_b32 s1, 2
+; GCN_DBG-NEXT:    s_lshl_b32 s1, s0, s1
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s4
+; GCN_DBG-NEXT:    s_mov_b32 s4, 0x80
+; GCN_DBG-NEXT:    s_add_i32 s1, s1, s4
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_read_b32 v0, v0
+; GCN_DBG-NEXT:    s_mov_b32 s4, 1.0
+; GCN_DBG-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN_DBG-NEXT:    v_add_f32_e64 v1, v0, s4
+; GCN_DBG-NEXT:    s_mov_b32 m0, -1
+; GCN_DBG-NEXT:    v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT:    ds_write_b32 v0, v1
+; GCN_DBG-NEXT:    s_mov_b32 s1, 1
+; GCN_DBG-NEXT:    s_add_i32 s0, s0, s1
+; GCN_DBG-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GCN_DBG-NEXT:    v_writelane_b32 v2, s0, 3
+; GCN_DBG-NEXT:    s_cbranch_vccnz .LBB4_1
+; GCN_DBG-NEXT:    s_branch .LBB4_2
 entry:
   %cond = load volatile i1, i1 addrspace(3)* null
   br label %for.body
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -8,35 +8,41 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX9-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  .LBB0_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v2
-; GFX9-NEXT:    v_not_b32_e32 v5, v2
-; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v5
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_i32 s7, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_add_i32 s7, s2, s7
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX9-NEXT:    s_add_i32 s7, s6, 1
+; GFX9-NEXT:    s_not_b32 s6, s6
+; GFX9-NEXT:    s_mul_i32 s6, s4, s6
+; GFX9-NEXT:    v_mov_b32_e32 v4, s7
+; GFX9-NEXT:    s_add_i32 s6, s2, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX9-NEXT:    v_add_u32_e32 v4, s2, v5
+; GFX9-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_add_u32_e32 v5, 1, v2
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
-; GFX9-NEXT:    v_add_u32_e32 v4, 1, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
 ; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -48,34 +54,38 @@
 ; GFX10-LABEL: udiv32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_mov_b64 s[2:3], 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX10-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, s5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:  .LBB0_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_not_b32_e32 v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, s5, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, s2, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, s2, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v4
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX10-NEXT:    s_mul_i32 s7, s5, s6
+; GFX10-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_mul_i32 s7, s3, s6
+; GFX10-NEXT:    s_mul_hi_u32 s6, s2, s6
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_mul_i32 s7, s5, s6
+; GFX10-NEXT:    s_add_i32 s7, s2, s7
+; GFX10-NEXT:    s_cmp_ge_u32 s7, s4
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_add_i32 s8, s6, 1
+; GFX10-NEXT:    s_not_b32 s9, s6
+; GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; GFX10-NEXT:    s_mul_i32 s8, s4, s9
+; GFX10-NEXT:    s_add_i32 s8, s2, s8
 ; GFX10-NEXT:    s_add_u32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
 ; GFX10-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
@@ -110,33 +120,35 @@
 ; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX9-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  .LBB1_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT:    v_mul_lo_u32 v3, s5, v2
-; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, s2, v3
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
-; GFX9-NEXT:    v_add_u32_e32 v2, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_i32 s7, s3, s6
+; GFX9-NEXT:    s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT:    s_add_i32 s6, s6, s7
+; GFX9-NEXT:    s_mul_i32 s7, s5, s6
+; GFX9-NEXT:    s_not_b32 s6, s6
+; GFX9-NEXT:    s_mul_i32 s6, s4, s6
+; GFX9-NEXT:    s_add_i32 s7, s2, s7
+; GFX9-NEXT:    s_add_i32 s6, s2, s6
+; GFX9-NEXT:    s_cmp_ge_u32 s7, s4
+; GFX9-NEXT:    s_cselect_b32 s6, s6, s7
+; GFX9-NEXT:    s_sub_i32 s7, s6, s4
+; GFX9-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX9-NEXT:    s_cselect_b32 s6, s7, s6
 ; GFX9-NEXT:    s_add_u32 s2, s2, 1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    s_addc_u32 s3, s3, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -148,35 +160,37 @@
 ; GFX10-LABEL: urem32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s4, s[0:1], 0x2c
-; GFX10-NEXT:    s_mov_b64 s[2:3], 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s4
 ; GFX10-NEXT:    s_sub_i32 s5, 0, s4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, s5, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:  .LBB1_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GFX10-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v3, v2
-; GFX10-NEXT:    v_not_b32_e32 v3, v2
-; GFX10-NEXT:    v_mul_lo_u32 v2, s5, v2
-; GFX10-NEXT:    v_mul_lo_u32 v3, s4, v3
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, s2, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, s2, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX10-NEXT:    s_mul_i32 s7, s5, s6
+; GFX10-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_mul_i32 s7, s3, s6
+; GFX10-NEXT:    s_mul_hi_u32 s6, s2, s6
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_not_b32 s7, s6
+; GFX10-NEXT:    s_mul_i32 s6, s5, s6
+; GFX10-NEXT:    s_mul_i32 s7, s4, s7
+; GFX10-NEXT:    s_add_i32 s6, s2, s6
+; GFX10-NEXT:    s_add_i32 s7, s2, s7
+; GFX10-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX10-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX10-NEXT:    s_sub_i32 s7, s6, s4
+; GFX10-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX10-NEXT:    s_cselect_b32 s6, s7, s6
 ; GFX10-NEXT:    s_add_u32 s2, s2, 1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX10-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s4, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s4, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
@@ -249,38 +263,41 @@
 ; GFX10-LABEL: sdiv32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x2c
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_ashr_i32 s2, s3, 31
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_add_i32 s3, s3, s2
 ; GFX10-NEXT:    s_xor_b32 s3, s3, s2
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX10-NEXT:    s_sub_i32 s4, 0, s3
+; GFX10-NEXT:    s_sub_i32 s5, 0, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, s4, v0
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:  .LBB2_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mul_hi_u32 v2, s4, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, v2, s3
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s4, v3
+; GFX10-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX10-NEXT:    s_mul_i32 s7, s5, s6
+; GFX10-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX10-NEXT:    s_add_i32 s6, s6, s7
+; GFX10-NEXT:    s_mul_hi_u32 s6, s4, s6
+; GFX10-NEXT:    s_mul_i32 s7, s6, s3
+; GFX10-NEXT:    s_sub_i32 s7, s4, s7
+; GFX10-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX10-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX10-NEXT:    s_add_i32 s8, s6, 1
 ; GFX10-NEXT:    s_add_i32 s4, s4, 1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s3, v3
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v2, s8
+; GFX10-NEXT:    s_sub_i32 s8, s7, s3
+; GFX10-NEXT:    v_mov_b32_e32 v3, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, s6, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s2, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s2, v2
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
@@ -310,34 +327,35 @@
 ; GFX9-LABEL: srem32_invariant_denom:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
 ; GFX9-NEXT:    s_add_i32 s2, s2, s3
 ; GFX9-NEXT:    s_xor_b32 s2, s2, s3
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT:    s_sub_i32 s3, 0, s2
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0
+; GFX9-NEXT:    s_sub_i32 s4, 0, s2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX9-NEXT:    s_mov_b32 s3, 0
-; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:  .LBB3_1: ; %bb3
 ; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    v_mul_hi_u32 v2, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX9-NEXT:    s_mul_i32 s6, s4, s5
+; GFX9-NEXT:    s_mul_hi_u32 s6, s5, s6
+; GFX9-NEXT:    s_add_i32 s5, s5, s6
+; GFX9-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT:    s_mul_i32 s5, s5, s2
+; GFX9-NEXT:    s_sub_i32 s5, s3, s5
+; GFX9-NEXT:    s_sub_i32 s6, s5, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX9-NEXT:    s_sub_i32 s6, s5, s2
+; GFX9-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX9-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX9-NEXT:    s_add_i32 s3, s3, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX9-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s0, 4
 ; GFX9-NEXT:    s_addc_u32 s1, s1, 0
@@ -349,34 +367,35 @@
 ; GFX10-LABEL: srem32_invariant_denom:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10-NEXT:    s_add_i32 s2, s2, s3
 ; GFX10-NEXT:    s_xor_b32 s2, s2, s3
+; GFX10-NEXT:    s_mov_b32 s3, 0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT:    s_sub_i32 s3, 0, s2
+; GFX10-NEXT:    s_sub_i32 s4, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, s3, v0
-; GFX10-NEXT:    s_mov_b32 s3, 0
-; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:  .LBB3_1: ; %bb3
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_mul_hi_u32 v2, s3, v0
-; GFX10-NEXT:    v_mul_lo_u32 v2, v2, s2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX10-NEXT:    s_mul_i32 s6, s4, s5
+; GFX10-NEXT:    s_mul_hi_u32 s6, s5, s6
+; GFX10-NEXT:    s_add_i32 s5, s5, s6
+; GFX10-NEXT:    s_mul_hi_u32 s5, s3, s5
+; GFX10-NEXT:    s_mul_i32 s5, s5, s2
+; GFX10-NEXT:    s_sub_i32 s5, s3, s5
+; GFX10-NEXT:    s_sub_i32 s6, s5, s2
+; GFX10-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX10-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX10-NEXT:    s_sub_i32 s6, s5, s2
+; GFX10-NEXT:    s_cmp_ge_u32 s5, s2
+; GFX10-NEXT:    s_cselect_b32 s5, s6, s5
 ; GFX10-NEXT:    s_add_i32 s3, s3, 1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s2, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s2, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX10-NEXT:    global_store_dword v1, v2, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_add_u32 s0, s0, 4
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -211,7 +211,7 @@
 ; SI-NEXT:    s_mov_b32 s4, s2
 ; SI-NEXT:    s_mov_b32 s5, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v0, v1
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v1
 ; SI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v2, v0
 ; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
@@ -235,7 +235,7 @@
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v6, vcc, v5, v4
+; VI-NEXT:    v_add_u32_e32 v6, vcc, v4, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v5
 ; VI-NEXT:    v_cmp_lt_i32_e64 s[0:1], v6, v4
 ; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -104,14 +104,17 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    s_lshl_b32 s1, s0, 8
+; SI-NEXT:    s_or_b32 s0, s1, s0
+; SI-NEXT:    s_and_b32 s1, s0, 0xff00
+; SI-NEXT:    s_lshr_b32 s4, s0, 8
+; SI-NEXT:    s_or_b32 s1, s4, s1
+; SI-NEXT:    s_lshl_b32 s4, s1, 16
+; SI-NEXT:    s_or_b32 s1, s1, s4
+; SI-NEXT:    s_or_b32 s0, s0, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -145,14 +148,17 @@
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
-; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v0
-; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; SI-NEXT:    v_or_b32_e32 v1, v2, v1
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
-; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_readfirstlane_b32 s0, v0
+; SI-NEXT:    s_lshl_b32 s1, s0, 8
+; SI-NEXT:    s_or_b32 s0, s1, s0
+; SI-NEXT:    s_and_b32 s1, s0, 0xff00
+; SI-NEXT:    s_lshr_b32 s4, s0, 8
+; SI-NEXT:    s_or_b32 s1, s4, s1
+; SI-NEXT:    s_lshl_b32 s4, s1, 16
+; SI-NEXT:    s_or_b32 s1, s1, s4
+; SI-NEXT:    s_or_b32 s0, s0, s4
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -42,20 +42,20 @@
 ; GCN-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v1
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v4, v0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v1, v0
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v0, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -87,7 +87,7 @@
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v5, v2
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v3, v1
 ; TONGA-NEXT:    v_add_u32_e32 v5, vcc, 1, v3
@@ -100,7 +100,7 @@
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; TONGA-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v2
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
+; TONGA-NEXT:    v_subrev_u32_e32 v0, vcc, v2, v0
 ; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -214,7 +214,7 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -235,7 +235,7 @@
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; TONGA-NEXT:    v_lshrrev_b32_e32 v1, 30, v1
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 2, v0
 ; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
@@ -433,7 +433,7 @@
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GCN-NEXT:    v_mul_hi_u32 v6, v7, v11
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v4, v2
@@ -459,7 +459,7 @@
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v9
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v9, v1
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -506,7 +506,7 @@
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; TONGA-NEXT:    v_mul_hi_u32 v6, v7, v11
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v6, v7
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v7, v6
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v0, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
 ; TONGA-NEXT:    v_mul_lo_u32 v6, v4, v2
@@ -853,7 +853,7 @@
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v11, vcc, v0, v4
+; GCN-NEXT:    v_subrev_i32_e32 v11, vcc, v4, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
@@ -865,7 +865,7 @@
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v12, v4
 ; GCN-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v7
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GCN-NEXT:    v_mul_hi_u32 v0, v2, v0
@@ -894,7 +894,7 @@
 ; GCN-NEXT:    v_xor_b32_e32 v1, v8, v15
 ; GCN-NEXT:    v_xor_b32_e32 v5, v0, v16
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v15
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v5, v16
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v16, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v9, v12
 ; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
@@ -907,13 +907,13 @@
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v10, v7, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v17
 ; GCN-NEXT:    v_mul_lo_u32 v6, v5, v4
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v17, v2
 ; GCN-NEXT:    v_xor_b32_e32 v7, v8, v14
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v6, v3
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; GCN-NEXT:    v_sub_i32_e32 v6, vcc, v3, v4
+; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v4, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
@@ -996,7 +996,7 @@
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v12, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, 1, v8
-; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
+; TONGA-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
 ; TONGA-NEXT:    v_mul_hi_u32 v7, v1, v7
 ; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v10
 ; TONGA-NEXT:    v_mul_hi_u32 v0, v2, v0
@@ -1040,17 +1040,17 @@
 ; TONGA-NEXT:    v_mul_lo_u32 v6, v5, v4
 ; TONGA-NEXT:    v_subrev_u32_e32 v2, vcc, v17, v2
 ; TONGA-NEXT:    v_xor_b32_e32 v7, v8, v14
-; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
+; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v4
 ; TONGA-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
-; TONGA-NEXT:    v_sub_u32_e32 v6, vcc, v3, v4
+; TONGA-NEXT:    v_subrev_u32_e32 v6, vcc, v4, v3
 ; TONGA-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; TONGA-NEXT:    v_add_u32_e32 v6, vcc, 1, v5
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v4
 ; TONGA-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; TONGA-NEXT:    v_xor_b32_e32 v3, v3, v7
-; TONGA-NEXT:    v_subrev_u32_e32 v3, vcc, v7, v3
+; TONGA-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; TONGA-NEXT:    s_endpgm
 ;
@@ -1515,7 +1515,7 @@
 ; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 8
 ; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
@@ -1680,7 +1680,7 @@
 ; TONGA-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
@@ -1824,7 +1824,7 @@
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -1865,7 +1865,7 @@
 ; TONGA-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; TONGA-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v1|
 ; TONGA-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; TONGA-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; TONGA-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
@@ -1993,21 +1993,21 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_bfe_i32 v2, v1, 0, 25
 ; GCN-NEXT:    v_bfe_i32 v1, v1, 24, 1
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v1
 ; GCN-NEXT:    v_xor_b32_e32 v2, v2, v1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
 ; GCN-NEXT:    v_bfe_i32 v5, v0, 0, 25
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 24, 1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v0, v5
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_xor_b32_e32 v5, v5, v0
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GCN-NEXT:    v_mul_hi_u32 v3, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v1, v3, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v3
@@ -2020,7 +2020,7 @@
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -2041,21 +2041,21 @@
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_bfe_i32 v2, v1, 0, 25
 ; TONGA-NEXT:    v_bfe_i32 v1, v1, 24, 1
-; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v1, v2
+; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v1
 ; TONGA-NEXT:    v_xor_b32_e32 v2, v2, v1
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v3, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, 0, v2
 ; TONGA-NEXT:    v_bfe_i32 v5, v0, 0, 25
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; TONGA-NEXT:    v_bfe_i32 v0, v0, 24, 1
-; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v0, v5
+; TONGA-NEXT:    v_add_u32_e32 v5, vcc, v5, v0
 ; TONGA-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v0
 ; TONGA-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; TONGA-NEXT:    v_mul_lo_u32 v4, v4, v3
 ; TONGA-NEXT:    v_mul_hi_u32 v4, v3, v4
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; TONGA-NEXT:    v_mul_hi_u32 v3, v5, v3
 ; TONGA-NEXT:    v_mul_lo_u32 v1, v3, v2
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, 1, v3
@@ -2232,7 +2232,7 @@
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -2262,10 +2262,10 @@
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v2, 12, v2
 ; TONGA-NEXT:    v_lshrrev_b32_e32 v7, 31, v3
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v3, 12, v3
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
+; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; TONGA-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
+; TONGA-NEXT:    v_add_u32_e32 v3, vcc, v7, v3
 ; TONGA-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; TONGA-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -36,8 +36,8 @@
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
 ; GCN-NEXT:    s_addc_u32 s3, s3, s12
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -97,9 +97,9 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s10, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -524,7 +524,7 @@
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -628,7 +628,7 @@
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -664,7 +664,7 @@
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -731,7 +731,7 @@
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -798,7 +798,7 @@
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -926,7 +926,7 @@
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
-; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-IR-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-IR-NEXT:    v_bfe_i32 v2, v2, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
@@ -1115,8 +1115,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -1165,7 +1165,7 @@
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
@@ -1800,7 +1800,7 @@
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1827,7 +1827,7 @@
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1858,7 +1858,7 @@
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s8
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -83,9 +83,9 @@
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_lshl_b32_e32 v3, v3, v7
-; SI-NEXT:    v_lshl_b32_e32 v2, v2, v6
-; SI-NEXT:    v_lshl_b32_e32 v1, v1, v5
+; SI-NEXT:    v_lshlrev_b32_e32 v3, v7, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v2, v6, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, v5, v1
 ; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -85,9 +85,9 @@
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
-; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
-; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
+; SI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
 ; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -162,8 +162,8 @@
 ; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
 ; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; SI-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
-; SI-NEXT:    v_ashrrev_i32_e32 v1, v1, v2
+; SI-NEXT:    v_ashr_i32_e32 v0, v0, v3
+; SI-NEXT:    v_ashr_i32_e32 v1, v2, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -184,9 +184,18 @@
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_readfirstlane_b32 s0, v0
+; VI-NEXT:    v_readfirstlane_b32 s1, v1
+; VI-NEXT:    s_ashr_i32 s2, s0, 16
+; VI-NEXT:    s_sext_i32_i16 s0, s0
+; VI-NEXT:    s_ashr_i32 s3, s1, 16
+; VI-NEXT:    s_sext_i32_i16 s1, s1
+; VI-NEXT:    s_ashr_i32 s0, s0, s1
+; VI-NEXT:    s_ashr_i32 s1, s2, s3
+; VI-NEXT:    s_lshl_b32 s1, s1, 16
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -249,10 +258,10 @@
 ; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; SI-NEXT:    v_ashr_i32_e32 v1, v1, v7
-; SI-NEXT:    v_ashr_i32_e32 v3, v5, v3
-; SI-NEXT:    v_ashr_i32_e32 v0, v0, v6
-; SI-NEXT:    v_ashr_i32_e32 v2, v4, v2
+; SI-NEXT:    v_ashrrev_i32_e32 v1, v7, v1
+; SI-NEXT:    v_ashrrev_i32_e32 v3, v3, v5
+; SI-NEXT:    v_ashrrev_i32_e32 v0, v6, v0
+; SI-NEXT:    v_ashrrev_i32_e32 v2, v2, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -28,8 +28,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -86,8 +86,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
@@ -202,8 +202,8 @@
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s10, -1
@@ -669,7 +669,7 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -701,7 +701,7 @@
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -740,7 +740,7 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    s_mov_b32 s5, s1
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -772,7 +772,7 @@
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
@@ -810,7 +810,7 @@
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
@@ -839,7 +839,7 @@
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s3, v0
@@ -889,8 +889,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -950,8 +950,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s15, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s14, v0
@@ -1078,13 +1078,15 @@
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-IR-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[14:15]
 ; GCN-IR-NEXT:  .LBB8_6: ; %udiv-end
-; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s8, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v3, s9, v0
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v1, s8, v1
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    v_readfirstlane_b32 s10, v2
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, s9, v0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, s10, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1092,7 +1094,6 @@
 ; GCN-IR-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-IR-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
@@ -1239,8 +1240,8 @@
 ; GCN-IR-NEXT:    v_mul_hi_u32 v2, s6, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s7, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
@@ -1291,8 +1292,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -1340,7 +1341,7 @@
 ; GCN-NEXT:    v_mul_lo_u32 v1, s9, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s8, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
@@ -1447,8 +1448,8 @@
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
@@ -1970,7 +1971,7 @@
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
@@ -2030,7 +2031,7 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GCN-NEXT:    s_movk_i32 s3, 0x5b7f
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    s_mov_b32 s5, s1
@@ -2059,7 +2060,7 @@
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s4
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GCN-IR-NEXT:    s_movk_i32 s3, 0x5b7f
-; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s3
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
 ; GCN-IR-NEXT:    s_mov_b32 s5, s1
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -138,9 +138,9 @@
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshr_b32_e32 v3, v3, v7
-; SI-NEXT:    v_lshr_b32_e32 v2, v2, v6
-; SI-NEXT:    v_lshr_b32_e32 v1, v1, v5
-; SI-NEXT:    v_lshr_b32_e32 v0, v0, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -47,7 +47,7 @@
 
 ; GCN-LABEL: {{^}}test_sub_v2i32:
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; GFX9: v_sub_u32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
@@ -61,7 +61,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_sub_v4i32:
-; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
+; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -27,7 +27,7 @@
 ; SI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; SI-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; SI-NEXT:    v_mul_hi_u32 v3, v2, v3
-; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; SI-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
@@ -63,7 +63,7 @@
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; VI-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; VI-NEXT:    v_mul_hi_u32 v3, v2, v3
-; VI-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; VI-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; VI-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v2
@@ -93,7 +93,7 @@
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
@@ -193,7 +193,7 @@
 ; SI-NEXT:    v_mul_lo_u32 v1, s4, v0
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; SI-NEXT:    v_mul_hi_u32 v0, s2, v0
 ; SI-NEXT:    v_mul_lo_u32 v1, v0, s3
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
@@ -271,25 +271,29 @@
 ; GFX1030-LABEL: s_udiv_i32:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX1030-NEXT:    s_sub_i32 s2, 0, s1
+; GFX1030-NEXT:    s_sub_i32 s3, 0, s1
 ; GFX1030-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX1030-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX1030-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX1030-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX1030-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX1030-NEXT:    s_mul_i32 s3, s3, s2
+; GFX1030-NEXT:    s_mul_hi_u32 s3, s2, s3
+; GFX1030-NEXT:    s_add_i32 s2, s2, s3
+; GFX1030-NEXT:    s_mul_hi_u32 s6, s0, s2
+; GFX1030-NEXT:    s_mul_i32 s2, s6, s1
+; GFX1030-NEXT:    s_sub_i32 s0, s0, s2
 ; GFX1030-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
-; GFX1030-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX1030-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX1030-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX1030-NEXT:    v_mul_lo_u32 v1, v0, s1
-; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v1, s0, v1
-; GFX1030-NEXT:    v_subrev_nc_u32_e32 v3, s1, v1
-; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
-; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX1030-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1030-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX1030-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1030-NEXT:    s_add_i32 s7, s6, 1
+; GFX1030-NEXT:    s_sub_i32 s4, s0, s1
+; GFX1030-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1030-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1030-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
+; GFX1030-NEXT:    v_cndmask_b32_e32 v1, s0, v1, vcc_lo
 ; GFX1030-NEXT:    v_add_nc_u32_e32 v2, 1, v0
 ; GFX1030-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v1
 ; GFX1030-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
@@ -368,7 +372,7 @@
 ; SI-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; SI-NEXT:    v_subrev_i32_e32 v0, vcc, v6, v0
-; SI-NEXT:    v_subrev_i32_e32 v1, vcc, v8, v1
+; SI-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
@@ -422,13 +426,13 @@
 ; VI-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; VI-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; VI-NEXT:    v_add_u32_e32 v7, vcc, 1, v4
-; VI-NEXT:    v_subrev_u32_e32 v0, vcc, v6, v0
+; VI-NEXT:    v_sub_u32_e32 v0, vcc, v0, v6
 ; VI-NEXT:    v_subrev_u32_e32 v1, vcc, v8, v1
 ; VI-NEXT:    v_add_u32_e32 v9, vcc, 1, v5
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
 ; VI-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v2, v0
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, v0, v2
 ; VI-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[2:3]
 ; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v3, v1
 ; VI-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
@@ -473,13 +477,13 @@
 ; GCN-NEXT:    v_mul_lo_u32 v8, v6, v2
 ; GCN-NEXT:    v_add_u32_e32 v9, vcc, 1, v6
 ; GCN-NEXT:    v_mul_lo_u32 v10, v7, v3
-; GCN-NEXT:    v_subrev_u32_e32 v0, vcc, v8, v0
+; GCN-NEXT:    v_sub_u32_e32 v0, vcc, v0, v8
 ; GCN-NEXT:    v_add_u32_e32 v11, vcc, 1, v7
 ; GCN-NEXT:    v_subrev_u32_e32 v1, vcc, v10, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v2
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], v1, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
-; GCN-NEXT:    v_subrev_u32_e32 v8, vcc, v2, v0
+; GCN-NEXT:    v_sub_u32_e32 v8, vcc, v0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s[2:3]
 ; GCN-NEXT:    v_subrev_u32_e32 v9, vcc, v3, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[0:1]
@@ -654,9 +658,9 @@
 ; SI-NEXT:    v_mul_lo_u32 v16, v10, v2
 ; SI-NEXT:    v_mul_lo_u32 v18, v11, v3
 ; SI-NEXT:    v_subrev_i32_e32 v4, vcc, v12, v4
-; SI-NEXT:    v_subrev_i32_e32 v5, vcc, v14, v5
-; SI-NEXT:    v_subrev_i32_e32 v6, vcc, v16, v6
-; SI-NEXT:    v_subrev_i32_e32 v7, vcc, v18, v7
+; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v14
+; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v16
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, v7, v18
 ; SI-NEXT:    v_add_i32_e32 v13, vcc, 1, v8
 ; SI-NEXT:    v_add_i32_e32 v15, vcc, 1, v9
 ; SI-NEXT:    v_add_i32_e32 v17, vcc, 1, v10
@@ -670,9 +674,9 @@
 ; SI-NEXT:    v_cndmask_b32_e64 v9, v9, v15, s[2:3]
 ; SI-NEXT:    v_subrev_i32_e32 v13, vcc, v1, v5
 ; SI-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[4:5]
-; SI-NEXT:    v_subrev_i32_e32 v14, vcc, v2, v6
+; SI-NEXT:    v_sub_i32_e32 v14, vcc, v6, v2
 ; SI-NEXT:    v_cndmask_b32_e64 v11, v11, v19, s[6:7]
-; SI-NEXT:    v_subrev_i32_e32 v15, vcc, v3, v7
+; SI-NEXT:    v_sub_i32_e32 v15, vcc, v7, v3
 ; SI-NEXT:    v_cndmask_b32_e64 v4, v4, v12, s[0:1]
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, 1, v8
 ; SI-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[2:3]
@@ -736,8 +740,8 @@
 ; VI-NEXT:    v_mul_hi_u32 v13, v12, v13
 ; VI-NEXT:    v_mul_hi_u32 v15, v14, v15
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
-; VI-NEXT:    v_add_u32_e32 v9, vcc, v11, v10
-; VI-NEXT:    v_add_u32_e32 v10, vcc, v13, v12
+; VI-NEXT:    v_add_u32_e32 v9, vcc, v10, v11
+; VI-NEXT:    v_add_u32_e32 v10, vcc, v12, v13
 ; VI-NEXT:    v_add_u32_e32 v11, vcc, v15, v14
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mul_hi_u32 v8, v4, v8
@@ -748,10 +752,10 @@
 ; VI-NEXT:    v_mul_lo_u32 v14, v9, v1
 ; VI-NEXT:    v_mul_lo_u32 v16, v10, v2
 ; VI-NEXT:    v_mul_lo_u32 v18, v11, v3
-; VI-NEXT:    v_subrev_u32_e32 v4, vcc, v12, v4
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v12
 ; VI-NEXT:    v_subrev_u32_e32 v5, vcc, v14, v5
 ; VI-NEXT:    v_subrev_u32_e32 v6, vcc, v16, v6
-; VI-NEXT:    v_subrev_u32_e32 v7, vcc, v18, v7
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, v7, v18
 ; VI-NEXT:    v_add_u32_e32 v13, vcc, 1, v8
 ; VI-NEXT:    v_add_u32_e32 v15, vcc, 1, v9
 ; VI-NEXT:    v_add_u32_e32 v17, vcc, 1, v10
@@ -831,8 +835,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v15, v14, v15
 ; GCN-NEXT:    v_mul_hi_u32 v17, v16, v17
 ; GCN-NEXT:    v_add_u32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_add_u32_e32 v11, vcc, v13, v12
-; GCN-NEXT:    v_add_u32_e32 v12, vcc, v15, v14
+; GCN-NEXT:    v_add_u32_e32 v11, vcc, v12, v13
+; GCN-NEXT:    v_add_u32_e32 v12, vcc, v14, v15
 ; GCN-NEXT:    v_add_u32_e32 v13, vcc, v17, v16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v10, v4, v10
@@ -843,10 +847,10 @@
 ; GCN-NEXT:    v_mul_lo_u32 v16, v11, v1
 ; GCN-NEXT:    v_mul_lo_u32 v18, v12, v2
 ; GCN-NEXT:    v_mul_lo_u32 v19, v13, v3
-; GCN-NEXT:    v_subrev_u32_e32 v4, vcc, v14, v4
+; GCN-NEXT:    v_sub_u32_e32 v4, vcc, v4, v14
 ; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v16, v5
 ; GCN-NEXT:    v_subrev_u32_e32 v6, vcc, v18, v6
-; GCN-NEXT:    v_subrev_u32_e32 v7, vcc, v19, v7
+; GCN-NEXT:    v_sub_u32_e32 v7, vcc, v7, v19
 ; GCN-NEXT:    v_add_u32_e32 v15, vcc, 1, v10
 ; GCN-NEXT:    v_add_u32_e32 v17, vcc, 1, v11
 ; GCN-NEXT:    v_add_u32_e32 v14, vcc, 1, v12
@@ -1854,10 +1858,10 @@
 ; SI-NEXT:    v_mul_hi_u32 v1, v2, v1
 ; SI-NEXT:    v_mul_lo_u32 v3, v1, v0
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
-; SI-NEXT:    v_subrev_i32_e32 v2, vcc, v3, v2
+; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; SI-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v2
+; SI-NEXT:    v_sub_i32_e32 v3, vcc, v2, v0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; SI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
@@ -1897,14 +1901,14 @@
 ; VI-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; VI-NEXT:    v_mul_lo_u32 v4, v4, v1
 ; VI-NEXT:    v_mul_hi_u32 v4, v1, v4
-; VI-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
+; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
 ; VI-NEXT:    v_mul_hi_u32 v1, v2, v1
 ; VI-NEXT:    v_mul_lo_u32 v3, v1, v0
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
-; VI-NEXT:    v_subrev_u32_e32 v2, vcc, v3, v2
+; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; VI-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
-; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v0, v2
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v2, v0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 1, v1
 ; VI-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
@@ -1950,16 +1954,16 @@
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v1
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_u32_e32 v0, vcc, v4, v1
+; GCN-NEXT:    v_add_u32_e32 v0, vcc, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v4, v2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mul_lo_u32 v5, v4, v3
 ; GCN-NEXT:    v_add_u32_e32 v6, vcc, 1, v4
-; GCN-NEXT:    v_subrev_u32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[0:1]
-; GCN-NEXT:    v_subrev_u32_e32 v5, vcc, v3, v2
+; GCN-NEXT:    v_sub_u32_e32 v5, vcc, v2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GCN-NEXT:    v_add_u32_e32 v5, vcc, 1, v4
 ; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v3
@@ -2351,7 +2355,7 @@
 ; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; SI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -2377,7 +2381,7 @@
 ; VI-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; VI-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; VI-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -2401,7 +2405,7 @@
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GCN-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
 ;
@@ -2597,7 +2601,7 @@
 ; VI-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; VI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; VI-NEXT:    v_mul_lo_u32 v4, v7, s6
-; VI-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
 ; VI-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; VI-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
@@ -2684,7 +2688,7 @@
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v7, v3
 ; GCN-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v7, s6
-; GCN-NEXT:    v_subrev_u32_e32 v3, vcc, v6, v3
+; GCN-NEXT:    v_sub_u32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_add_u32_e32 v8, vcc, v4, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v6, v2
 ; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0
@@ -2765,32 +2769,34 @@
 ; GFX1030-NEXT:    s_addc_u32 s5, 0, 0
 ; GFX1030-NEXT:    v_add_co_u32 v2, s4, 0xa9000000, s4
 ; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
-; GFX1030-NEXT:    s_addc_u32 s4, s5, 0xa7c5
-; GFX1030-NEXT:    v_mul_hi_u32 v3, 0xfffe7960, v2
-; GFX1030-NEXT:    v_mul_lo_u32 v4, 0xfffe7960, v2
-; GFX1030-NEXT:    s_mul_i32 s5, s4, 0xfffe7960
-; GFX1030-NEXT:    v_sub_nc_u32_e32 v3, v3, v2
-; GFX1030-NEXT:    v_mul_hi_u32 v5, v2, v4
-; GFX1030-NEXT:    v_mul_hi_u32 v8, s4, v4
-; GFX1030-NEXT:    v_mul_lo_u32 v4, s4, v4
-; GFX1030-NEXT:    v_add_nc_u32_e32 v3, s5, v3
-; GFX1030-NEXT:    v_mul_lo_u32 v6, v2, v3
-; GFX1030-NEXT:    v_mul_hi_u32 v7, v2, v3
-; GFX1030-NEXT:    v_mul_hi_u32 v9, s4, v3
-; GFX1030-NEXT:    v_mul_lo_u32 v3, s4, v3
-; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v5, v6
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, 0, v7, vcc_lo
-; GFX1030-NEXT:    v_add_co_u32 v4, vcc_lo, v5, v4
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo
-; GFX1030-NEXT:    v_add_co_u32 v3, vcc_lo, v4, v3
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
-; GFX1030-NEXT:    v_add_co_u32 v5, vcc_lo, v2, v3
-; GFX1030-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s4, v4, vcc_lo
-; GFX1030-NEXT:    v_mul_hi_u32 v8, v0, v5
-; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, v5, 0
-; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, v6, 0
-; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v1, v6, 0
+; GFX1030-NEXT:    s_addc_u32 s5, s5, 0xa7c5
+; GFX1030-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX1030-NEXT:    s_mul_i32 s6, s5, 0xfffe7960
+; GFX1030-NEXT:    s_mul_hi_u32 s7, s4, 0xfffe7960
+; GFX1030-NEXT:    s_mul_i32 s8, s4, 0xfffe7960
+; GFX1030-NEXT:    s_sub_i32 s7, s7, s4
+; GFX1030-NEXT:    s_mul_hi_u32 s9, s4, s8
+; GFX1030-NEXT:    s_add_i32 s7, s7, s6
+; GFX1030-NEXT:    s_mul_hi_u32 s10, s5, s8
+; GFX1030-NEXT:    s_mul_i32 s6, s5, s8
+; GFX1030-NEXT:    s_mul_hi_u32 s8, s4, s7
+; GFX1030-NEXT:    s_mul_i32 s4, s4, s7
+; GFX1030-NEXT:    s_mul_hi_u32 s11, s5, s7
+; GFX1030-NEXT:    s_add_u32 s4, s9, s4
+; GFX1030-NEXT:    s_addc_u32 s8, 0, s8
+; GFX1030-NEXT:    s_add_u32 s4, s4, s6
+; GFX1030-NEXT:    s_mul_i32 s7, s5, s7
+; GFX1030-NEXT:    s_addc_u32 s4, s8, s10
+; GFX1030-NEXT:    s_addc_u32 s6, s11, 0
+; GFX1030-NEXT:    s_add_u32 s4, s4, s7
+; GFX1030-NEXT:    s_addc_u32 s6, 0, s6
+; GFX1030-NEXT:    v_add_co_u32 v4, s4, v2, s4
+; GFX1030-NEXT:    s_cmpk_lg_u32 s4, 0x0
+; GFX1030-NEXT:    s_addc_u32 s4, s5, s6
+; GFX1030-NEXT:    v_mul_hi_u32 v8, v0, v4
+; GFX1030-NEXT:    v_mad_u64_u32 v[2:3], null, v0, s4, 0
+; GFX1030-NEXT:    v_mad_u64_u32 v[4:5], null, v1, v4, 0
+; GFX1030-NEXT:    v_mad_u64_u32 v[6:7], null, v1, s4, 0
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v2
 ; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -26,8 +26,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -86,9 +86,9 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s9, v0
 ; GCN-NEXT:    v_mov_b32_e32 v5, s9
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, s8, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
@@ -704,8 +704,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
 ; GCN-NEXT:    v_mul_lo_u32 v6, s0, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, v6
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v3
@@ -725,9 +725,9 @@
 ; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_mul_lo_u32 v7, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v8, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
@@ -908,8 +908,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -958,7 +958,7 @@
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 24, v2
@@ -1356,8 +1356,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s4
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, s4
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -1378,8 +1378,8 @@
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -1531,8 +1531,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, s4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, s4
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GCN-NEXT:    v_mul_hi_u32 v7, v2, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
@@ -1552,7 +1552,7 @@
 ; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, v4, v2
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -52,7 +52,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
 ; GFX6-NEXT:    s_load_dword s3, s[0:1], 0x1d
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s2
@@ -86,7 +86,7 @@
 ; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v0
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
@@ -176,7 +176,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
@@ -204,41 +204,45 @@
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s7
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s6
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s7
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX8-NEXT:    s_sub_i32 s2, 0, s7
-; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
+; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s6
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s7
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s7, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s6, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s7, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    s_mul_i32 s2, s2, s6
+; GFX8-NEXT:    s_sub_i32 s2, s4, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s6
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s6
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s3, 0, s7
+; GFX8-NEXT:    v_mul_lo_u32 v0, s3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
+; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX8-NEXT:    s_mul_i32 s2, s2, s7
+; GFX8-NEXT:    s_sub_i32 s2, s5, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s7
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s7
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv <2 x i32> %x, %y
@@ -341,7 +345,7 @@
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
@@ -374,7 +378,7 @@
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
-; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
@@ -399,75 +403,83 @@
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GFX8-NEXT:    s_sub_i32 s2, 0, s8
-; GFX8-NEXT:    s_sub_i32 s3, 0, s9
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s9
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s11
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v0
-; GFX8-NEXT:    s_sub_i32 s2, 0, s10
-; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
-; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, s2, v0
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s4, v0
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v4, v1
-; GFX8-NEXT:    v_mul_hi_u32 v1, s5, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, s8
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v3
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, s9
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s5, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s8, v0
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v3, s2, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v5
-; GFX8-NEXT:    s_sub_i32 s2, 0, s11
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v4
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s9, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, s6, v2
-; GFX8-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v2, v2, s10
-; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v5
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s6, v2
-; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
-; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s10, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v3, v3, s11
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s10, v2
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s7, v3
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s11, v3
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s10
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX8-NEXT:    s_mul_i32 s2, s2, s8
+; GFX8-NEXT:    s_sub_i32 s2, s4, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s8
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s8
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s8
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s3, 0, s9
+; GFX8-NEXT:    v_mul_lo_u32 v0, s3, v1
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s11
+; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX8-NEXT:    s_mul_i32 s3, s3, s9
+; GFX8-NEXT:    s_sub_i32 s3, s5, s3
+; GFX8-NEXT:    s_sub_i32 s4, s3, s9
+; GFX8-NEXT:    s_cmp_ge_u32 s3, s9
+; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    s_sub_i32 s4, s3, s9
+; GFX8-NEXT:    s_cmp_ge_u32 s3, s9
+; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX8-NEXT:    s_sub_i32 s4, 0, s10
+; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v1
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v2
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX8-NEXT:    s_mul_i32 s4, s4, s10
+; GFX8-NEXT:    s_sub_i32 s4, s6, s4
+; GFX8-NEXT:    s_sub_i32 s5, s4, s10
+; GFX8-NEXT:    s_cmp_ge_u32 s4, s10
+; GFX8-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX8-NEXT:    s_sub_i32 s5, s4, s10
+; GFX8-NEXT:    s_cmp_ge_u32 s4, s10
+; GFX8-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX8-NEXT:    s_sub_i32 s5, 0, s11
+; GFX8-NEXT:    v_mul_lo_u32 v0, s5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
+; GFX8-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX8-NEXT:    s_mul_i32 s2, s2, s11
+; GFX8-NEXT:    s_sub_i32 s2, s7, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s11
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s11
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    s_sub_i32 s3, s2, s11
+; GFX8-NEXT:    s_cmp_ge_u32 s2, s11
+; GFX8-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
   %result0 = udiv <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -28,8 +28,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -86,8 +86,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s13
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
@@ -202,8 +202,8 @@
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s5, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s4, v0
 ; GCN-IR-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s10, -1
@@ -733,8 +733,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v2
@@ -782,7 +782,7 @@
 ; GCN-NEXT:    v_mul_lo_u32 v1, s7, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
 ; GCN-NEXT:    v_mul_lo_u32 v0, s6, v0
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
@@ -884,8 +884,8 @@
 ; GCN-IR-NEXT:    v_mul_lo_u32 v3, s3, v0
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
@@ -919,8 +919,8 @@
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s4
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, s4
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
@@ -941,7 +941,7 @@
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, v2, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
@@ -1071,7 +1071,7 @@
 ; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
-; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; GCN-IR-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GCN-IR-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-IR-NEXT:    s_mov_b32 s4, s0
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -135,7 +135,7 @@
 }
 
 ; FUNC-LABEL: {{^}}v_usubo_i16:
-; SI: v_subrev_i32_e32
+; SI: v_sub_i32_e32
 ; SI: v_and_b32
 ; SI: v_cmp_ne_u32_e32
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -234,7 +234,7 @@
   ; SI-NEXT: bb.1.Flow:
   ; SI-NEXT:   successors: %bb.2(0x40000000), %bb.10(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %29:vgpr_32, %bb.0, %4, %bb.9
+  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %43:vgpr_32, %bb.0, %4, %bb.9
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %45:vgpr_32, %bb.9
   ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %47:vgpr_32, %bb.9
   ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %49:vgpr_32, %bb.9
@@ -357,7 +357,7 @@
   ; SI-NEXT: bb.1.Flow:
   ; SI-NEXT:   successors: %bb.2(0x40000000), %bb.10(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %30:vgpr_32, %bb.0, %4, %bb.9
+  ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %44:vgpr_32, %bb.0, %4, %bb.9
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %46:vgpr_32, %bb.9
   ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %48:vgpr_32, %bb.9
   ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
@@ -510,7 +510,7 @@
   ; SI-NEXT: bb.6.sw.bb18:
   ; SI-NEXT:   successors: %bb.5(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %39:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
+  ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
   ; SI-NEXT:   [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
   ; SI-NEXT:   GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1)
   ; SI-NEXT:   S_BRANCH %bb.5
@@ -596,27 +596,17 @@
   ; SI-NEXT: bb.2:
   ; SI-NEXT:   successors: %bb.3(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE3]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE4:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_2]], %subreg.sub0, [[V_READFIRSTLANE_B32_3]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE4]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec
   ; SI-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_]], killed [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec
-  ; SI-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE5]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec
-  ; SI-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_]], killed [[V_CMP_EQ_U64_e64_2]], implicit-def dead $scc
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec
-  ; SI-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec
-  ; SI-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc
-  ; SI-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_4]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub7
-  ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_2]], implicit-def $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_2]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub3
+  ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3:
   ; SI-NEXT:   successors: %bb.4(0x80000000)
@@ -626,22 +616,32 @@
   ; SI-NEXT: bb.4:
   ; SI-NEXT:   successors: %bb.5(0x80000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub0, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub0, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub1, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE6:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_4]], %subreg.sub0, [[V_READFIRSTLANE_B32_5]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_2:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE6]], [[REG_SEQUENCE2]].sub0_sub1, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub2, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub3, implicit $exec
+  ; SI-NEXT:   [[REG_SEQUENCE7:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_6]], %subreg.sub0, [[V_READFIRSTLANE_B32_7]], %subreg.sub1
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE7]], [[REG_SEQUENCE2]].sub2_sub3, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_2]], killed [[V_CMP_EQ_U64_e64_3]], implicit-def dead $scc
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub4, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub5, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE8:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_8]], %subreg.sub0, [[V_READFIRSTLANE_B32_9]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[GLOBAL_LOAD_DWORDX4_2]].sub0_sub1, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub2, implicit $exec
-  ; SI-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[GLOBAL_LOAD_DWORDX4_2]].sub3, implicit $exec
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_4:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE8]], [[REG_SEQUENCE2]].sub4_sub5, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_1]], killed [[V_CMP_EQ_U64_e64_4]], implicit-def dead $scc
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub6, implicit $exec
+  ; SI-NEXT:   [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[REG_SEQUENCE2]].sub7, implicit $exec
   ; SI-NEXT:   [[REG_SEQUENCE9:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[V_READFIRSTLANE_B32_10]], %subreg.sub0, [[V_READFIRSTLANE_B32_11]], %subreg.sub1
-  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[GLOBAL_LOAD_DWORDX4_2]].sub2_sub3, implicit $exec
-  ; SI-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[V_CMP_EQ_U64_e64_4]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc
-  ; SI-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_8]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub3
+  ; SI-NEXT:   [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 killed [[REG_SEQUENCE9]], [[REG_SEQUENCE2]].sub6_sub7, implicit $exec
+  ; SI-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 killed [[S_AND_B32_2]], killed [[V_CMP_EQ_U64_e64_5]], implicit-def dead $scc
+  ; SI-NEXT:   [[REG_SEQUENCE10:%[0-9]+]]:sgpr_256 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_4]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_5]], %subreg.sub1, killed [[V_READFIRSTLANE_B32_6]], %subreg.sub2, killed [[V_READFIRSTLANE_B32_7]], %subreg.sub3, killed [[V_READFIRSTLANE_B32_8]], %subreg.sub4, killed [[V_READFIRSTLANE_B32_9]], %subreg.sub5, killed [[V_READFIRSTLANE_B32_10]], %subreg.sub6, killed [[V_READFIRSTLANE_B32_11]], %subreg.sub7
   ; SI-NEXT:   [[S_AND_SAVEEXEC_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_3]], implicit-def $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.5:
   ; SI-NEXT:   successors: %bb.4(0x40000000), %bb.6(0x40000000)
   ; SI-NEXT: {{  $}}
-  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, [[REG_SEQUENCE7]], killed [[REG_SEQUENCE10]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource")
+  ; SI-NEXT:   [[IMAGE_SAMPLE_V1_V2_gfx10_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx10 undef %27:vreg_64, killed [[REG_SEQUENCE10]], [[REG_SEQUENCE5]], 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "ImageResource")
   ; SI-NEXT:   $exec_lo = S_XOR_B32_term $exec_lo, killed [[S_AND_SAVEEXEC_B32_1]], implicit-def dead $scc
   ; SI-NEXT:   SI_WATERFALL_LOOP %bb.4, implicit $exec
   ; SI-NEXT: {{  $}}