diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -85,7 +85,8 @@
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
-FunctionPass *createAMDGPUAtomicOptimizerPass();
+enum class ScanOptions : bool { DPP, Iterative };
+FunctionPass *createAMDGPUAtomicOptimizerPass(bool UseDpp);
 void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
 extern char &AMDGPUAtomicOptimizerID;
 
@@ -234,11 +235,13 @@
 };
 
 struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
-  AMDGPUAtomicOptimizerPass(TargetMachine &TM) : TM(TM) {}
+  AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl)
+      : TM(TM), ScanImpl(ScanImpl) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 
 private:
   TargetMachine &TM;
+  ScanOptions ScanImpl;
 };
 
 Pass *createAMDGPUStructurizeCFGPass();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -41,8 +41,9 @@
 class AMDGPUAtomicOptimizer : public FunctionPass {
 public:
   static char ID;
-
-  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+  ScanOptions ScanImpl;
+  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
+      : FunctionPass(ID), ScanImpl(ScanImpl) {}
 
   bool runOnFunction(Function &F) override;
 
@@ -62,6 +63,7 @@
   DominatorTree *DT;
   const GCNSubtarget *ST;
   bool IsPixelShader;
+  ScanOptions ScanImpl;
 
   Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                         Value *const Identity) const;
@@ -69,6 +71,11 @@
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
 
+  std::pair<Value *, Value *>
+  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+                       Value *const Identity, Value *V, Instruction &I,
+                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;
+
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
 
@@ -77,8 +84,9 @@
 
   AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                             DominatorTree *DT, const GCNSubtarget *ST,
-                            bool IsPixelShader)
-      : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader) {}
+                            bool IsPixelShader, ScanOptions ScanImpl)
+      : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader),
+        ScanImpl(ScanImpl) {}
 
   bool run(Function &F);
 
@@ -111,7 +119,8 @@
 
   bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
 
-  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F);
+  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader, ScanImpl)
+      .run(F);
 }
 
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
@@ -125,7 +134,8 @@
 
   bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
 
-  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F)
+  return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader, ScanImpl)
+                 .run(F)
              ? PreservedAnalyses::none()
              : PreservedAnalyses::all();
 }
@@ -470,6 +480,78 @@
   return V;
 }
 
+// Use the builder to create an exclusive scan and compute the final reduced
+// value using an iterative approach. This provides an alternative
+// implementation to DPP which uses WMM for scan computations. This API iterate
+// over active lanes to read, compute and update the value using
+// readlane and writelane intrinsics.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
+    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
+    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
+
+  Type *const Ty = I.getType();
+  unsigned WaveFrontSize = ST->isWave32() ? 32 : 64;
+  Type *const WaveTy = B.getIntNTy(WaveFrontSize);
+  BasicBlock *const EntryBB = I.getParent();
+  const bool NeedResult = !I.use_empty();
+
+  Value *Ballot =
+      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+
+  // Branch to ComputeLoop Block unconditionally from the I's block
+  B.CreateBr(ComputeLoop);
+
+  // Start inserting instructions for ComputeLoop block
+  B.SetInsertPoint(ComputeLoop);
+  // Phi nodes for Accumulator, Scan results destination, and Active Lanes
+  PHINode *const Accumulator = B.CreatePHI(Ty, 2, "accum");
+  Accumulator->addIncoming(Identity, EntryBB);
+  PHINode *OldValuePhi = nullptr;
+  if (NeedResult) {
+    OldValuePhi = B.CreatePHI(Ty, 2, "old_value_phi");
+    OldValuePhi->addIncoming(UndefValue::get(Ty), EntryBB);
+  }
+  PHINode *const ActiveBits = B.CreatePHI(WaveTy, 2, "active_bits");
+  ActiveBits->addIncoming(Ballot, EntryBB);
+
+  // Use llvm.cttz instrinsic to find the lowest remaining active lane.
+  Value *FF1 =
+      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
+  Value *LaneIdxInt = B.CreateTrunc(FF1, Ty);
+
+  // Get the value required for atomic operation
+  Value *LaneValue =
+      B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+
+  // Perform writelane if intermediate scan results are required later in the
+  // kernel computations
+  Value *OldValue = nullptr;
+  if (NeedResult) {
+    OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
+    OldValuePhi->addIncoming(OldValue, ComputeLoop);
+  }
+
+  // Accumulate the results
+  Value *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
+  Accumulator->addIncoming(NewAccumulator, ComputeLoop);
+
+  // Set bit to zero of current active lane so that for next iteration llvm.cttz
+  // return the next active lane
+  Value *Mask = B.CreateShl(B.getIntN(WaveFrontSize, 1), FF1);
+  Value *InverseMask = B.CreateXor(Mask, B.getIntN(WaveFrontSize, -1));
+  Value *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
+  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);
+
+  // Branch out of the loop when all lanes are processed.
+  Value *IsEnd = B.CreateICmpEQ(NewActiveBits, B.getIntN(WaveFrontSize, 0));
+  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);
+
+  B.SetInsertPoint(ComputeEnd);
+
+  return std::make_pair(OldValue, NewAccumulator);
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -568,36 +650,48 @@
 
   const bool NeedResult = !I.use_empty();
 
+  Function *F = I.getFunction();
+  LLVMContext &C = I.getParent()->getContext();
+  BasicBlock *ComputeLoop = nullptr;
+  BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
-    // First we need to set all inactive invocations to the identity value, so
-    // that they can correctly contribute to the final result.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
     const AtomicRMWInst::BinOp ScanOp =
         Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    if (!NeedResult && ST->hasPermLaneX16()) {
-      // On GFX10 the permlanex16 instruction helps us build a reduction without
-      // too many readlanes and writelanes, which are generally bad for
-      // performance.
-      NewV = buildReduction(B, ScanOp, NewV, Identity);
+    if (ScanImpl == ScanOptions::DPP) {
+      // First we need to set all inactive invocations to the identity value, so
+      // that they can correctly contribute to the final result.
+      NewV =
+          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+      const AtomicRMWInst::BinOp ScanOp =
+          Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+      if (!NeedResult && ST->hasPermLaneX16()) {
+        // On GFX10 the permlanex16 instruction helps us build a reduction
+        // without too many readlanes and writelanes, which are generally bad
+        // for performance.
+        NewV = buildReduction(B, ScanOp, NewV, Identity);
+      } else {
+        NewV = buildScan(B, ScanOp, NewV, Identity);
+        if (NeedResult)
+          ExclScan = buildShiftRight(B, NewV, Identity);
+        // Read the value from the last lane, which has accumulated the values
+        // of each active lane in the wavefront. This will be our new value
+        // which we will provide to the atomic operation.
+        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        assert(TyBitWidth == 32);
+        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                 {NewV, LastLaneIdx});
+      }
+      // Finally mark the readlanes in the WWM section.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
     } else {
-      NewV = buildScan(B, ScanOp, NewV, Identity);
-      if (NeedResult)
-        ExclScan = buildShiftRight(B, NewV, Identity);
-
-      // Read the value from the last lane, which has accumulated the values of
-      // each active lane in the wavefront. This will be our new value which we
-      // will provide to the atomic operation.
-      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-      assert(TyBitWidth == 32);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
+      // Alternative implementation for scan
+      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
+      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
+      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
+                                                      ComputeLoop, ComputeEnd);
     }
-
-    // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -650,6 +744,16 @@
   Instruction *const SingleLaneTerminator =
       SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
 
+  BasicBlock *Predecessor = nullptr;
+  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
+    Instruction *Terminator = EntryBB->getTerminator();
+    B.SetInsertPoint(ComputeEnd);
+    Terminator->removeFromParent();
+    B.Insert(Terminator);
+    Predecessor = ComputeEnd;
+  } else {
+    Predecessor = EntryBB;
+  }
   // Move the IR builder into single_lane next.
   B.SetInsertPoint(SingleLaneTerminator);
 
@@ -666,7 +770,7 @@
   if (NeedResult) {
     // Create a PHI node to get our new atomic result into the exit block.
     PHINode *const PHI = B.CreatePHI(Ty, 2);
-    PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
+    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
     PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
 
     // We need to broadcast the value who was the lowest active lane (the first
@@ -700,8 +804,12 @@
     // from the first lane, to get our lane's index into the atomic result.
     Value *LaneOffset = nullptr;
     if (ValDivergent) {
-      LaneOffset =
-          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+      if (ScanImpl == ScanOptions::DPP) {
+        LaneOffset =
+            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+      } else {
+        LaneOffset = ExclScan;
+      }
     } else {
       switch (Op) {
       default:
@@ -750,6 +858,7 @@
 INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                     "AMDGPU atomic optimizations", false, false)
 
-FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
-  return new AMDGPUAtomicOptimizer();
+FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(bool IsUseDpp) {
+  return new AMDGPUAtomicOptimizer(IsUseDpp ? ScanOptions::DPP
+                                            : ScanOptions::Iterative);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -269,12 +269,15 @@
     cl::desc("Enable VGPR liverange optimizations for if-else structure"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> EnableAtomicOptimizationsUsingDPP(
+    "amdgpu-atomic-optimizer-use-dpp",
+    cl::desc("Use DPP in the atomic optimizer"), cl::init(true), cl::Hidden);
+
 // Enable atomic optimization
-static cl::opt<bool> EnableAtomicOptimizations(
-  "amdgpu-atomic-optimizations",
-  cl::desc("Enable atomic optimizations"),
-  cl::init(false),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableAtomicOptimizations("amdgpu-atomic-optimizations",
+                              cl::desc("Enable atomic optimizations"),
+                              cl::init(false), cl::Hidden);
 
 // Enable Mode register optimization
 static cl::opt<bool> EnableSIModeRegisterPass(
@@ -651,7 +654,10 @@
           return true;
         }
         if (PassName == "amdgpu-atomic-optimizer") {
-          PM.addPass(AMDGPUAtomicOptimizerPass(*this));
+          PM.addPass(
+              AMDGPUAtomicOptimizerPass(*this, EnableAtomicOptimizationsUsingDPP
+                                                   ? ScanOptions::DPP
+                                                   : ScanOptions::Iterative));
           return true;
         }
         return false;
@@ -1117,7 +1123,7 @@
     addPass(createAMDGPULateCodeGenPreparePass());
 
   if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
-    addPass(createAMDGPUAtomicOptimizerPass());
+    addPass(createAMDGPUAtomicOptimizerPass(EnableAtomicOptimizationsUsingDPP));
   }
 
   if (TM->getOptLevel() > CodeGenOpt::None)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer -verify-machineinstrs %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-atomic-optimizations -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
@@ -449,313 +449,257 @@
 ;
 ; GFX8-LABEL: add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB2_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
-; GFX8-NEXT:  .LBB2_2:
+; GFX8-NEXT:  .LBB2_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
-; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:  .LBB2_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    buffer_atomic_add v4, off, s[8:11], 0 glc
-; GFX10W64-NEXT:  .LBB2_2:
+; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX10W64-NEXT:  .LBB2_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    buffer_atomic_add v4, off, s[4:7], 0 glc
-; GFX10W32-NEXT:  .LBB2_2:
+; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX10W32-NEXT:  .LBB2_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    buffer_atomic_add_u32 v4, off, s[8:11], 0 glc
-; GFX11W64-NEXT:  .LBB2_2:
+; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX11W64-NEXT:  .LBB2_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    buffer_atomic_add_u32 v4, off, s[4:7], 0 glc
-; GFX11W32-NEXT:  .LBB2_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
+; GFX11W32-NEXT:  .LBB2_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
@@ -782,328 +726,273 @@
 ;
 ; GFX8-LABEL: struct_add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB3_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB3_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dword s5, s[0:1], 0x44
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
-; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
-; GFX8-NEXT:  .LBB3_2:
+; GFX8-NEXT:    v_mov_b32_e32 v2, s5
+; GFX8-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX8-NEXT:  .LBB3_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: struct_add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB3_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB3_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dword s5, s[0:1], 0x44
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
-; GFX9-NEXT:  .LBB3_2:
+; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX9-NEXT:  .LBB3_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: struct_add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB3_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB3_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_clause 0x1
 ; GFX10W64-NEXT:    s_load_dword s5, s[0:1], 0x44
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    v_mov_b32_e32 v5, s5
-; GFX10W64-NEXT:    buffer_atomic_add v4, v5, s[8:11], 0 idxen glc
-; GFX10W64-NEXT:  .LBB3_2:
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, s5
+; GFX10W64-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX10W64-NEXT:  .LBB3_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: struct_add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB3_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB3_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_clause 0x1
 ; GFX10W32-NEXT:    s_load_dword s8, s[0:1], 0x44
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    v_mov_b32_e32 v5, s8
-; GFX10W32-NEXT:    buffer_atomic_add v4, v5, s[4:7], 0 idxen glc
-; GFX10W32-NEXT:  .LBB3_2:
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, s8
+; GFX10W32-NEXT:    buffer_atomic_add v0, v2, s[4:7], 0 idxen glc
+; GFX10W32-NEXT:  .LBB3_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: struct_add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB3_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB3_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_clause 0x1
 ; GFX11W64-NEXT:    s_load_b32 s5, s[0:1], 0x44
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    v_mov_b32_e32 v5, s5
-; GFX11W64-NEXT:    buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc
-; GFX11W64-NEXT:  .LBB3_2:
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
+; GFX11W64-NEXT:  .LBB3_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: struct_add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB3_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB3_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_clause 0x1
 ; GFX11W32-NEXT:    s_load_b32 s8, s[0:1], 0x44
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s8
-; GFX11W32-NEXT:    buffer_atomic_add_u32 v4, v5, s[4:7], 0 idxen glc
-; GFX11W32-NEXT:  .LBB3_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    v_mov_b32_e32 v2, s8
+; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
+; GFX11W32-NEXT:  .LBB3_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
@@ -1629,313 +1518,258 @@
 ;
 ; GFX8-LABEL: sub_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB7_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB7_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX8-NEXT:  .LBB7_2:
+; GFX8-NEXT:  .LBB7_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB7_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB7_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX9-NEXT:  .LBB7_2:
+; GFX9-NEXT:  .LBB7_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: sub_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB7_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    buffer_atomic_sub v4, off, s[8:11], 0 glc
-; GFX10W64-NEXT:  .LBB7_2:
+; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX10W64-NEXT:  .LBB7_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: sub_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB7_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    buffer_atomic_sub v4, off, s[4:7], 0 glc
-; GFX10W32-NEXT:  .LBB7_2:
+; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX10W32-NEXT:  .LBB7_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB7_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc
-; GFX11W64-NEXT:  .LBB7_2:
+; GFX11W64-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
+; GFX11W64-NEXT:  .LBB7_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB7_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc
-; GFX11W32-NEXT:  .LBB7_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
+; GFX11W32-NEXT:  .LBB7_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -517,36 +517,32 @@
 ; GFX8-LABEL: add_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8-NEXT:    s_mov_b32 s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s7, s5
+; GFX8-NEXT:    s_ff1_i32_b32 s8, s4
+; GFX8-NEXT:    s_add_i32 s7, s7, 32
+; GFX8-NEXT:    s_min_u32 s7, s8, s7
+; GFX8-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX8-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX8-NEXT:    s_mov_b32 m0, s7
+; GFX8-NEXT:    v_writelane_b32 v1, s6, m0
+; GFX8-NEXT:    s_add_i32 s6, s6, s10
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB2_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -557,50 +553,45 @@
 ; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
-; GFX8-NEXT:  .LBB2_2:
+; GFX8-NEXT:  .LBB2_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-NEXT:    s_mov_b32 s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s7, s5
+; GFX9-NEXT:    s_ff1_i32_b32 s8, s4
+; GFX9-NEXT:    s_add_i32 s7, s7, 32
+; GFX9-NEXT:    s_min_u32 s7, s8, s7
+; GFX9-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX9-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX9-NEXT:    s_mov_b32 m0, s7
+; GFX9-NEXT:    v_writelane_b32 v1, s6, m0
+; GFX9-NEXT:    s_add_i32 s6, s6, s10
+; GFX9-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -611,273 +602,214 @@
 ; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:  .LBB2_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 15
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_readlane_b32 s8, v1, 47
-; GFX1064-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1064-NEXT:    v_writelane_b32 v3, s7, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1064-NEXT:    s_mov_b32 s4, s9
-; GFX1064-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s7, s5
+; GFX1064-NEXT:    s_ff1_i32_b32 s8, s4
+; GFX1064-NEXT:    s_add_i32 s7, s7, 32
+; GFX1064-NEXT:    s_min_u32 s7, s8, s7
+; GFX1064-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX1064-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX1064-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064-NEXT:    s_add_i32 s6, s6, s10
+; GFX1064-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s4, s2
-; GFX1064-NEXT:    s_mov_b32 s5, s3
+; GFX1064-NEXT:    s_mov_b32 s8, s2
+; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1064-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
-; GFX1064-NEXT:  .LBB2_2:
+; GFX1064-NEXT:  .LBB2_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT:    s_mov_b32 s2, s6
+; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s6, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s5, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s4, s6
-; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1032-NEXT:    s_lshl_b32 s8, 1, s6
+; GFX1032-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1032-NEXT:    s_andn2_b32 s5, s5, s8
+; GFX1032-NEXT:    s_add_i32 s4, s4, s7
+; GFX1032-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
-; GFX1032-NEXT:    s_and_saveexec_b32 s8, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s4, s2
-; GFX1032-NEXT:    s_mov_b32 s5, s3
+; GFX1032-NEXT:    s_mov_b32 s8, s2
+; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX1032-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
-; GFX1032-NEXT:  .LBB2_2:
+; GFX1032-NEXT:  .LBB2_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT:    s_mov_b32 s2, s6
+; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 15
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_readlane_b32 s8, v1, 47
-; GFX1164-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1164-NEXT:    v_writelane_b32 v3, s7, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1164-NEXT:    s_mov_b32 s4, s9
-; GFX1164-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s6, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b32 s6, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s7, s5
+; GFX1164-NEXT:    s_ctz_i32_b32 s8, s4
+; GFX1164-NEXT:    s_add_i32 s7, s7, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s7, s8, s7
+; GFX1164-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX1164-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX1164-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT:    s_and_not1_b64 s[4:5], s[4:5], s[8:9]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_add_i32 s6, s6, s10
+; GFX1164-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT:    s_mov_b32 s10, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s4, s2
-; GFX1164-NEXT:    s_mov_b32 s5, s3
+; GFX1164-NEXT:    s_mov_b32 s8, s2
+; GFX1164-NEXT:    s_mov_b32 s9, s3
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
+; GFX1164-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:    buffer_gl1_inv
-; GFX1164-NEXT:  .LBB2_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-NEXT:  .LBB2_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT:    s_mov_b32 s2, s6
+; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s6, v1, 31
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s5, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s4, s6
-; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s4, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s6, s5
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1132-NEXT:    s_lshl_b32 s8, 1, s6
+; GFX1132-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1132-NEXT:    s_and_not1_b32 s5, s5, s8
+; GFX1132-NEXT:    s_add_i32 s4, s4, s7
+; GFX1132-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s8, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s10, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mov_b32 s4, s2
-; GFX1132-NEXT:    s_mov_b32 s5, s3
+; GFX1132-NEXT:    s_mov_b32 s8, s2
+; GFX1132-NEXT:    s_mov_b32 s9, s3
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
+; GFX1132-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:    buffer_gl1_inv
-; GFX1132-NEXT:  .LBB2_2:
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1132-NEXT:  .LBB2_4:
+; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_mov_b32 s2, s6
+; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
@@ -2085,36 +2017,32 @@
 ; GFX8-LABEL: sub_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8-NEXT:    s_mov_b32 s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s7, s5
+; GFX8-NEXT:    s_ff1_i32_b32 s8, s4
+; GFX8-NEXT:    s_add_i32 s7, s7, 32
+; GFX8-NEXT:    s_min_u32 s7, s8, s7
+; GFX8-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX8-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX8-NEXT:    s_mov_b32 m0, s7
+; GFX8-NEXT:    v_writelane_b32 v1, s6, m0
+; GFX8-NEXT:    s_add_i32 s6, s6, s10
+; GFX8-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX8-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB8_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2125,50 +2053,45 @@
 ; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
-; GFX8-NEXT:  .LBB8_2:
+; GFX8-NEXT:  .LBB8_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9-NEXT:    s_mov_b32 s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s7, s5
+; GFX9-NEXT:    s_ff1_i32_b32 s8, s4
+; GFX9-NEXT:    s_add_i32 s7, s7, 32
+; GFX9-NEXT:    s_min_u32 s7, s8, s7
+; GFX9-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX9-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX9-NEXT:    s_mov_b32 m0, s7
+; GFX9-NEXT:    v_writelane_b32 v1, s6, m0
+; GFX9-NEXT:    s_add_i32 s6, s6, s10
+; GFX9-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB8_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_mov_b32 s11, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2179,273 +2102,214 @@
 ; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
-; GFX9-NEXT:  .LBB8_2:
+; GFX9-NEXT:  .LBB8_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 15
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_readlane_b32 s8, v1, 47
-; GFX1064-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1064-NEXT:    v_writelane_b32 v3, s7, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1064-NEXT:    s_mov_b32 s4, s9
-; GFX1064-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s6, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064-NEXT:    s_mov_b32 s6, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s7, s5
+; GFX1064-NEXT:    s_ff1_i32_b32 s8, s4
+; GFX1064-NEXT:    s_add_i32 s7, s7, 32
+; GFX1064-NEXT:    s_min_u32 s7, s8, s7
+; GFX1064-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX1064-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX1064-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[8:9]
+; GFX1064-NEXT:    s_add_i32 s6, s6, s10
+; GFX1064-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s10, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s4, s2
-; GFX1064-NEXT:    s_mov_b32 s5, s3
+; GFX1064-NEXT:    s_mov_b32 s8, s2
+; GFX1064-NEXT:    s_mov_b32 s9, s3
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX1064-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:    buffer_gl1_inv
-; GFX1064-NEXT:  .LBB8_2:
+; GFX1064-NEXT:  .LBB8_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT:    s_mov_b32 s2, s6
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s6, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s5, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s4, s6
-; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s4, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1032-NEXT:    s_lshl_b32 s8, 1, s6
+; GFX1032-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1032-NEXT:    s_andn2_b32 s5, s5, s8
+; GFX1032-NEXT:    s_add_i32 s4, s4, s7
+; GFX1032-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
-; GFX1032-NEXT:    s_and_saveexec_b32 s8, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_and_saveexec_b32 s5, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s10, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mov_b32 s4, s2
-; GFX1032-NEXT:    s_mov_b32 s5, s3
+; GFX1032-NEXT:    s_mov_b32 s8, s2
+; GFX1032-NEXT:    s_mov_b32 s9, s3
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX1032-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:    buffer_gl1_inv
-; GFX1032-NEXT:  .LBB8_2:
+; GFX1032-NEXT:  .LBB8_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT:    s_mov_b32 s2, s6
+; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 15
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_readlane_b32 s8, v1, 47
-; GFX1164-NEXT:    v_readlane_b32 s9, v1, 63
-; GFX1164-NEXT:    v_writelane_b32 v3, s7, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GFX1164-NEXT:    s_mov_b32 s4, s9
-; GFX1164-NEXT:    v_writelane_b32 v3, s8, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s6, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1164-NEXT:    s_mov_b32 s6, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s7, s5
+; GFX1164-NEXT:    s_ctz_i32_b32 s8, s4
+; GFX1164-NEXT:    s_add_i32 s7, s7, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s7, s8, s7
+; GFX1164-NEXT:    v_readlane_b32 s10, v0, s7
+; GFX1164-NEXT:    s_lshl_b64 s[8:9], 1, s7
+; GFX1164-NEXT:    v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT:    s_and_not1_b64 s[4:5], s[4:5], s[8:9]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_add_i32 s6, s6, s10
+; GFX1164-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
+; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT:    s_mov_b32 s10, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s4, s2
-; GFX1164-NEXT:    s_mov_b32 s5, s3
+; GFX1164-NEXT:    s_mov_b32 s8, s2
+; GFX1164-NEXT:    s_mov_b32 s9, s3
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
+; GFX1164-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:    buffer_gl1_inv
-; GFX1164-NEXT:  .LBB8_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX1164-NEXT:  .LBB8_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT:    s_mov_b32 s2, s6
+; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
 ; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_readlane_b32 s5, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s6, v1, 31
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s5, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s4, s6
-; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s4, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s6, s5
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s7, v0, s6
+; GFX1132-NEXT:    s_lshl_b32 s8, 1, s6
+; GFX1132-NEXT:    v_writelane_b32 v1, s4, s6
+; GFX1132-NEXT:    s_and_not1_b32 s5, s5, s8
+; GFX1132-NEXT:    s_add_i32 s4, s4, s7
+; GFX1132-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s8, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
-; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s10, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mov_b32 s4, s2
-; GFX1132-NEXT:    s_mov_b32 s5, s3
+; GFX1132-NEXT:    s_mov_b32 s8, s2
+; GFX1132-NEXT:    s_mov_b32 s9, s3
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
+; GFX1132-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:    buffer_gl1_inv
-; GFX1132-NEXT:  .LBB8_2:
-; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX1132-NEXT:  .LBB8_4:
+; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT:    s_mov_b32 s2, s6
+; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX1132-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1,11 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
-; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
@@ -491,269 +491,226 @@
 ;
 ; GFX8-LABEL: add_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB2_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_add_rtn_u32 v0, v3, v0
+; GFX8-NEXT:    ds_add_rtn_u32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB2_2:
+; GFX8-NEXT:  .LBB2_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_add_rtn_u32 v0, v3, v0
+; GFX9-NEXT:    ds_add_rtn_u32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:  .LBB2_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_add_i32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_add_rtn_u32 v0, v4, v0
+; GFX1064-NEXT:    ds_add_rtn_u32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB2_2:
+; GFX1064-NEXT:  .LBB2_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_add_i32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_rtn_u32 v0, v4, v0
+; GFX1032-NEXT:    ds_add_rtn_u32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB2_2:
+; GFX1032-NEXT:  .LBB2_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_add_i32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_add_rtn_u32 v0, v4, v0
+; GFX1164-NEXT:    ds_add_rtn_u32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB2_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB2_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -761,53 +718,43 @@
 ;
 ; GFX1132-LABEL: add_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_add_i32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB2_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_add_rtn_u32 v0, v4, v0
+; GFX1132-NEXT:    ds_add_rtn_u32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB2_2:
+; GFX1132-NEXT:  .LBB2_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -831,215 +778,199 @@
 ;
 ; GFX8-LABEL: add_i32_varying_nouse:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
-; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8-NEXT:    s_mov_b32 s0, s2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB3_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[0:1], exec
+; GFX8-NEXT:    s_mov_b32 s2, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX8-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s3, s1
+; GFX8-NEXT:    s_ff1_i32_b32 s4, s0
+; GFX8-NEXT:    s_add_i32 s3, s3, 32
+; GFX8-NEXT:    s_min_u32 s3, s4, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX8-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8-NEXT:    s_cbranch_execz .LBB3_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_add_u32 v2, v0
+; GFX8-NEXT:    ds_add_u32 v0, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB3_2:
+; GFX8-NEXT:  .LBB3_4:
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_varying_nouse:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
-; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-NEXT:    s_mov_b32 s0, s2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB3_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s4, s0
+; GFX9-NEXT:    s_add_i32 s3, s3, 32
+; GFX9-NEXT:    s_min_u32 s3, s4, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB3_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_add_u32 v2, v0
+; GFX9-NEXT:    ds_add_u32 v0, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB3_2:
+; GFX9-NEXT:  .LBB3_4:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_varying_nouse:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
-; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1064-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s4, s0
+; GFX1064-NEXT:    s_add_i32 s3, s3, 32
+; GFX1064-NEXT:    s_min_u32 s3, s4, s3
+; GFX1064-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1064-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064-NEXT:    s_add_i32 s2, s2, s6
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    s_add_i32 s0, s2, s3
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_add_u32 v0, v3
+; GFX1064-NEXT:    ds_add_u32 v0, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB3_2:
+; GFX1064-NEXT:  .LBB3_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_varying_nouse:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1032-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1032-NEXT:    s_lshl_b32 s2, 1, s2
+; GFX1032-NEXT:    s_andn2_b32 s1, s1, s2
+; GFX1032-NEXT:    s_add_i32 s0, s0, s3
+; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_add_u32 v0, v3
+; GFX1032-NEXT:    ds_add_u32 v0, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB3_2:
+; GFX1032-NEXT:  .LBB3_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_varying_nouse:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1164-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
-; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s4, s0
+; GFX1164-NEXT:    s_add_i32 s3, s3, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s3, s4, s3
+; GFX1164-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1164-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164-NEXT:    s_add_i32 s2, s2, s6
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_add_u32 v0, v3
+; GFX1164-NEXT:    ds_add_u32 v0, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB3_2:
+; GFX1164-NEXT:  .LBB3_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_varying_nouse:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
-; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1132-NEXT:    s_lshl_b32 s2, 1, s2
+; GFX1132-NEXT:    s_and_not1_b32 s1, s1, s2
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    s_add_i32 s0, s0, s3
+; GFX1132-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132-NEXT:    s_cbranch_execz .LBB3_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_add_u32 v0, v3
+; GFX1132-NEXT:    ds_add_u32 v0, v1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB3_2:
+; GFX1132-NEXT:  .LBB3_4:
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2140,269 +2071,226 @@
 ;
 ; GFX8-LABEL: sub_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB9_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_sub_rtn_u32 v0, v3, v0
+; GFX8-NEXT:    ds_sub_rtn_u32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB9_2:
+; GFX8-NEXT:  .LBB9_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB9_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_sub_rtn_u32 v0, v3, v0
+; GFX9-NEXT:    ds_sub_rtn_u32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB9_2:
+; GFX9-NEXT:  .LBB9_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_add_i32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v4, v0
+; GFX1064-NEXT:    ds_sub_rtn_u32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB9_2:
+; GFX1064-NEXT:  .LBB9_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_add_i32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
-; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v4, v0
+; GFX1032-NEXT:    ds_sub_rtn_u32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB9_2:
+; GFX1032-NEXT:  .LBB9_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_add_i32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v4, v0
+; GFX1164-NEXT:    ds_sub_rtn_u32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB9_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB9_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2410,53 +2298,43 @@
 ;
 ; GFX1132-LABEL: sub_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_add_i32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v4, v0
+; GFX1132-NEXT:    ds_sub_rtn_u32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB9_2:
+; GFX1132-NEXT:  .LBB9_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2480,215 +2358,199 @@
 ;
 ; GFX8-LABEL: sub_i32_varying_nouse:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s2, v1, 63
-; GFX8-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX8-NEXT:    s_mov_b32 s0, s2
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB10_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[0:1], exec
+; GFX8-NEXT:    s_mov_b32 s2, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX8-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s3, s1
+; GFX8-NEXT:    s_ff1_i32_b32 s4, s0
+; GFX8-NEXT:    s_add_i32 s3, s3, 32
+; GFX8-NEXT:    s_min_u32 s3, s4, s3
+; GFX8-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX8-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX8-NEXT:    s_add_i32 s2, s2, s6
+; GFX8-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX8-NEXT:    s_cbranch_execz .LBB10_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_sub_u32 v2, v0
+; GFX8-NEXT:    ds_sub_u32 v0, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB10_2:
+; GFX8-NEXT:  .LBB10_4:
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_varying_nouse:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s2, v1, 63
-; GFX9-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX9-NEXT:    s_mov_b32 s0, s2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB10_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX9-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s3, s1
+; GFX9-NEXT:    s_ff1_i32_b32 s4, s0
+; GFX9-NEXT:    s_add_i32 s3, s3, 32
+; GFX9-NEXT:    s_min_u32 s3, s4, s3
+; GFX9-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX9-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX9-NEXT:    s_cbranch_execz .LBB10_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_sub_u32 v2, v0
+; GFX9-NEXT:    ds_sub_u32 v0, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB10_2:
+; GFX9-NEXT:  .LBB10_4:
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i32_varying_nouse:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1064-NEXT:    v_readlane_b32 s2, v1, 0
-; GFX1064-NEXT:    v_readlane_b32 s3, v1, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1064-NEXT:    s_mov_b32 s2, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1064-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s3, s1
+; GFX1064-NEXT:    s_ff1_i32_b32 s4, s0
+; GFX1064-NEXT:    s_add_i32 s3, s3, 32
+; GFX1064-NEXT:    s_min_u32 s3, s4, s3
+; GFX1064-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1064-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX1064-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX1064-NEXT:    s_add_i32 s2, s2, s6
+; GFX1064-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1064-NEXT:    s_cbranch_execz .LBB10_4
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    s_add_i32 s0, s2, s3
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v3, s0
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_sub_u32 v0, v3
+; GFX1064-NEXT:    ds_sub_u32 v0, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB10_2:
+; GFX1064-NEXT:  .LBB10_4:
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i32_varying_nouse:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s0, 0
+; GFX1032-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s2, s1
+; GFX1032-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1032-NEXT:    s_lshl_b32 s2, 1, s2
+; GFX1032-NEXT:    s_andn2_b32 s1, s1, s2
+; GFX1032-NEXT:    s_add_i32 s0, s0, s3
+; GFX1032-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1032-NEXT:    s_cbranch_execz .LBB10_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v3, v1
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_sub_u32 v0, v3
+; GFX1032-NEXT:    ds_sub_u32 v0, v1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB10_2:
+; GFX1032-NEXT:  .LBB10_4:
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: sub_i32_varying_nouse:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1164-NEXT:    v_permlane64_b32 v2, v1
-; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1164-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1164-NEXT:    v_mov_b32_e32 v3, v1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
-; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    s_mov_b32 s2, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX1164-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s3, s1
+; GFX1164-NEXT:    s_ctz_i32_b32 s4, s0
+; GFX1164-NEXT:    s_add_i32 s3, s3, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s3, s4, s3
+; GFX1164-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX1164-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT:    s_and_not1_b64 s[0:1], s[0:1], s[4:5]
+; GFX1164-NEXT:    s_add_i32 s2, s2, s6
+; GFX1164-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1164-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX1164-NEXT:    s_cbranch_execz .LBB10_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_sub_u32 v0, v3
+; GFX1164-NEXT:    ds_sub_u32 v0, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB10_2:
+; GFX1164-NEXT:  .LBB10_4:
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_varying_nouse:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s0, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s0
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1
-; GFX1132-NEXT:    s_mov_b32 s0, exec_lo
-; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v4
-; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s0, 0
+; GFX1132-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX1132-NEXT:    s_lshl_b32 s2, 1, s2
+; GFX1132-NEXT:    s_and_not1_b32 s1, s1, s2
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    s_add_i32 s0, s0, s3
+; GFX1132-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v1
+; GFX1132-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX1132-NEXT:    s_cbranch_execz .LBB10_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_sub_u32 v0, v3
+; GFX1132-NEXT:    ds_sub_u32 v0, v1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB10_2:
+; GFX1132-NEXT:  .LBB10_4:
 ; GFX1132-NEXT:    s_endpgm
 entry:
   %lane = call i32 @llvm.amdgcn.workitem.id.x()
@@ -3344,273 +3206,226 @@
 ;
 ; GFX8-LABEL: and_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, -1
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, -1
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    s_mov_b32 s4, -1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_and_b32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB14_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB14_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v3
+; GFX8-NEXT:    ds_and_rtn_b32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB14_2:
+; GFX8-NEXT:  .LBB14_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: and_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_mov_b32_e32 v1, -1
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, -1
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_and_b32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB14_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB14_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v3
+; GFX9-NEXT:    ds_and_rtn_b32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB14_2:
+; GFX9-NEXT:  .LBB14_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: and_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, -1
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_and_b32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB14_2
-; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB14_4
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v4
+; GFX1064-NEXT:    ds_and_rtn_b32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB14_2:
+; GFX1064-NEXT:  .LBB14_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_and_b32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: and_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_and_b32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB14_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB14_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v4
+; GFX1032-NEXT:    ds_and_rtn_b32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB14_2:
+; GFX1032-NEXT:  .LBB14_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_and_b32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: and_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, -1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_and_b32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB14_2
-; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB14_4
+; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v4
+; GFX1164-NEXT:    ds_and_rtn_b32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB14_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB14_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_and_b32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3618,53 +3433,43 @@
 ;
 ; GFX1132-LABEL: and_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_and_b32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB14_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB14_4
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v4
+; GFX1132-NEXT:    ds_and_rtn_b32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB14_2:
+; GFX1132-NEXT:  .LBB14_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_and_b32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_and_b32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3694,269 +3499,226 @@
 ;
 ; GFX8-LABEL: or_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_or_b32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB15_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB15_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_or_rtn_b32 v0, v3, v0
+; GFX8-NEXT:    ds_or_rtn_b32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB15_2:
+; GFX8-NEXT:  .LBB15_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: or_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_or_b32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB15_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB15_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_or_rtn_b32 v0, v3, v0
+; GFX9-NEXT:    ds_or_rtn_b32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB15_2:
+; GFX9-NEXT:  .LBB15_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_or_b32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: or_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_or_b32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB15_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_or_rtn_b32 v0, v4, v0
+; GFX1064-NEXT:    ds_or_rtn_b32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB15_2:
+; GFX1064-NEXT:  .LBB15_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_or_b32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: or_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_or_b32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB15_4
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_or_rtn_b32 v0, v4, v0
+; GFX1032-NEXT:    ds_or_rtn_b32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB15_2:
+; GFX1032-NEXT:  .LBB15_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_or_b32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: or_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_or_b32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB15_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_or_rtn_b32 v0, v4, v0
+; GFX1164-NEXT:    ds_or_rtn_b32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB15_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB15_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_or_b32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -3964,53 +3726,43 @@
 ;
 ; GFX1132-LABEL: or_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_or_b32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB15_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB15_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_or_rtn_b32 v0, v4, v0
+; GFX1132-NEXT:    ds_or_rtn_b32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB15_2:
+; GFX1132-NEXT:  .LBB15_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_or_b32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_or_b32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4040,269 +3792,226 @@
 ;
 ; GFX8-LABEL: xor_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_xor_b32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB16_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB16_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_xor_rtn_b32 v0, v3, v0
+; GFX8-NEXT:    ds_xor_rtn_b32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB16_2:
+; GFX8-NEXT:  .LBB16_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX8-NEXT:    v_xor_b32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: xor_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_xor_b32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB16_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB16_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_xor_rtn_b32 v0, v3, v0
+; GFX9-NEXT:    ds_xor_rtn_b32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB16_2:
+; GFX9-NEXT:  .LBB16_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: xor_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_xor_b32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB16_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v4, v0
+; GFX1064-NEXT:    ds_xor_rtn_b32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB16_2:
+; GFX1064-NEXT:  .LBB16_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_xor_b32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: xor_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_xor_b32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB16_2
-; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v4, v0
+; GFX1032-NEXT:    ds_xor_rtn_b32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB16_2:
+; GFX1032-NEXT:  .LBB16_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_xor_b32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: xor_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_xor_b32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB16_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v4, v0
+; GFX1164-NEXT:    ds_xor_rtn_b32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB16_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB16_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_xor_b32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4310,53 +4019,43 @@
 ;
 ; GFX1132-LABEL: xor_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_xor_b32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB16_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB16_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v4, v0
+; GFX1132-NEXT:    ds_xor_rtn_b32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB16_2:
+; GFX1132-NEXT:  .LBB16_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_xor_b32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_xor_b32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4386,273 +4085,226 @@
 ;
 ; GFX8-LABEL: max_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    s_brev_b32 s4, 1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_max_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB17_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB17_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v3
+; GFX8-NEXT:    ds_max_rtn_i32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB17_2:
+; GFX8-NEXT:  .LBB17_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_max_i32_e32 v0, s4, v0
+; GFX8-NEXT:    v_max_i32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: max_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_bfrev_b32_e32 v2, 1
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    s_brev_b32 s4, 1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_max_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB17_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB17_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v3
+; GFX9-NEXT:    ds_max_rtn_i32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB17_2:
+; GFX9-NEXT:  .LBB17_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_max_i32_e32 v0, s4, v0
+; GFX9-NEXT:    v_max_i32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: max_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_brev_b32 s4, 1
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_max_i32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB17_2
-; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB17_4
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v4
+; GFX1064-NEXT:    ds_max_rtn_i32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB17_2:
+; GFX1064-NEXT:  .LBB17_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_max_i32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: max_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_brev_b32 s2, 1
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_max_i32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB17_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB17_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v4
+; GFX1032-NEXT:    ds_max_rtn_i32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB17_2:
+; GFX1032-NEXT:  .LBB17_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_max_i32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: max_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_brev_b32 s4, 1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_max_i32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB17_2
-; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB17_4
+; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v4
+; GFX1164-NEXT:    ds_max_rtn_i32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB17_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB17_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_i32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4660,53 +4312,43 @@
 ;
 ; GFX1132-LABEL: max_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_bfrev_b32_e32 v3, 1
-; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_brev_b32 s2, 1
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_max_i32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB17_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB17_4
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v4
+; GFX1132-NEXT:    ds_max_rtn_i32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB17_2:
+; GFX1132-NEXT:  .LBB17_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_i32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_i32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -4987,273 +4629,226 @@
 ;
 ; GFX8-LABEL: min_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_min_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB19_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB19_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v3
+; GFX8-NEXT:    ds_min_rtn_i32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB19_2:
+; GFX8-NEXT:  .LBB19_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX8-NEXT:    v_min_i32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: min_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_bfrev_b32_e32 v2, -2
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    s_brev_b32 s4, -2
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_min_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB19_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB19_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v3
+; GFX9-NEXT:    ds_min_rtn_i32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB19_2:
+; GFX9-NEXT:  .LBB19_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX9-NEXT:    v_min_i32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: min_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_bfrev_b32_e32 v3, -2
-; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_brev_b32 s4, -2
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_min_i32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB19_2
-; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB19_4
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v4
+; GFX1064-NEXT:    ds_min_rtn_i32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB19_2:
+; GFX1064-NEXT:  .LBB19_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_min_i32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: min_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_bfrev_b32_e32 v3, -2
-; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_brev_b32 s2, -2
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_min_i32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB19_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB19_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v4
+; GFX1032-NEXT:    ds_min_rtn_i32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB19_2:
+; GFX1032-NEXT:  .LBB19_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_min_i32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: min_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_bfrev_b32_e32 v3, -2
-; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_brev_b32 s4, -2
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_min_i32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB19_2
-; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB19_4
+; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v4
+; GFX1164-NEXT:    ds_min_rtn_i32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB19_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB19_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_i32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5261,53 +4856,43 @@
 ;
 ; GFX1132-LABEL: min_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_bfrev_b32_e32 v1, -2
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_bfrev_b32_e32 v3, -2
-; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_brev_b32 s2, -2
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_min_i32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB19_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB19_4
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v4
+; GFX1132-NEXT:    ds_min_rtn_i32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB19_2:
+; GFX1132-NEXT:  .LBB19_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_i32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_i32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5588,269 +5173,226 @@
 ;
 ; GFX8-LABEL: umax_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_max_u32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB21_2
-; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB21_4
+; GFX8-NEXT:  ; %bb.3:
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_max_rtn_u32 v0, v3, v0
+; GFX8-NEXT:    ds_max_rtn_u32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB21_2:
+; GFX8-NEXT:  .LBB21_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_max_u32_e32 v0, s4, v0
+; GFX8-NEXT:    v_max_u32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: umax_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_max_u32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB21_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB21_4
+; GFX9-NEXT:  ; %bb.3:
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_max_rtn_u32 v0, v3, v0
+; GFX9-NEXT:    ds_max_rtn_u32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB21_2:
+; GFX9-NEXT:  .LBB21_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_max_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_max_u32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: umax_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, 0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_max_u32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB21_2
-; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB21_4
+; GFX1064-NEXT:  ; %bb.3:
+; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_max_rtn_u32 v0, v4, v0
+; GFX1064-NEXT:    ds_max_rtn_u32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB21_2:
+; GFX1064-NEXT:  .LBB21_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_max_u32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: umax_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032-NEXT:    s_mov_b32 s2, 0
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_max_u32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB21_2
-; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB21_4
+; GFX1032-NEXT:  ; %bb.3:
+; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_max_rtn_u32 v0, v4, v0
+; GFX1032-NEXT:    ds_max_rtn_u32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB21_2:
+; GFX1032-NEXT:  .LBB21_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_max_u32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: umax_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, 0
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_max_u32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB21_2
-; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    v_mov_b32_e32 v0, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB21_4
+; GFX1164-NEXT:  ; %bb.3:
+; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_max_rtn_u32 v0, v4, v0
+; GFX1164-NEXT:    ds_max_rtn_u32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB21_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB21_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_max_u32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -5858,53 +5400,43 @@
 ;
 ; GFX1132-LABEL: umax_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
-; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132-NEXT:    s_mov_b32 s2, 0
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_max_u32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB21_2
-; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB21_4
+; GFX1132-NEXT:  ; %bb.3:
+; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_max_rtn_u32 v0, v4, v0
+; GFX1132-NEXT:    ds_max_rtn_u32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB21_2:
+; GFX1132-NEXT:  .LBB21_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_max_u32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_max_u32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6180,273 +5712,226 @@
 ;
 ; GFX8-LABEL: umin_i32_varying:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, -1
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, -1
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    s_mov_b32 s4, -1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_min_u32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB23_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB23_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v3
+; GFX8-NEXT:    ds_min_rtn_u32 v0, v0, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:  .LBB23_2:
+; GFX8-NEXT:  .LBB23_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX8-NEXT:    v_min_u32_e32 v0, s4, v1
 ; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: umin_i32_varying:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_mov_b32_e32 v1, -1
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v2, -1
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    s_mov_b32 s4, -1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_min_u32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB23_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB23_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v3
+; GFX9-NEXT:    ds_min_rtn_u32 v0, v0, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB23_2:
+; GFX9-NEXT:  .LBB23_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX9-NEXT:    v_min_u32_e32 v0, s4, v1
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: umin_i32_varying:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1064-NEXT:    s_not_b64 exec, exec
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1064-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1064-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1064-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1064-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1064-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1064-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1064-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1064-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064-NEXT:    s_mov_b32 s4, -1
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1064-NEXT:    ; implicit-def: $vgpr1
+; GFX1064-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX1064-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX1064-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX1064-NEXT:    s_add_i32 s5, s5, 32
+; GFX1064-NEXT:    s_min_u32 s5, s6, s5
+; GFX1064-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1064-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1064-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1064-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX1064-NEXT:    s_min_u32 s4, s4, s8
+; GFX1064-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1064-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX1064-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
-; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB23_2
-; GFX1064-NEXT:  ; %bb.1:
+; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1064-NEXT:    s_cbranch_execz .LBB23_4
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1064-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1064-NEXT:    s_mov_b32 s3, s7
+; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v4
+; GFX1064-NEXT:    ds_min_rtn_u32 v0, v0, v2
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_gl0_inv
-; GFX1064-NEXT:  .LBB23_2:
+; GFX1064-NEXT:  .LBB23_4:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1064-NEXT:    v_min_u32_e32 v0, s3, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_min_u32_e32 v0, s2, v1
+; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: umin_i32_varying:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1032-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1032-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1032-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1032-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
+; GFX1032-NEXT:    ; implicit-def: $vgpr1
+; GFX1032-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX1032-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT:    s_min_u32 s2, s2, s5
+; GFX1032-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1032-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX1032-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB23_2
-; GFX1032-NEXT:  ; %bb.1:
+; GFX1032-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1032-NEXT:    s_cbranch_execz .LBB23_4
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1032-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1032-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v4
+; GFX1032-NEXT:    ds_min_rtn_u32 v0, v0, v2
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_gl0_inv
-; GFX1032-NEXT:  .LBB23_2:
+; GFX1032-NEXT:  .LBB23_4:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1032-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1032-NEXT:    v_min_u32_e32 v0, s3, v0
+; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT:    v_min_u32_e32 v0, s2, v1
+; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: umin_i32_varying:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1164-NEXT:    s_not_b64 exec, exec
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
-; GFX1164-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1164-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX1164-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX1164-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX1164-NEXT:    v_readlane_b32 s7, v1, 63
-; GFX1164-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX1164-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
-; GFX1164-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164-NEXT:    s_mov_b32 s4, -1
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX1164-NEXT:    ; implicit-def: $vgpr1
+; GFX1164-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX1164-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX1164-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX1164-NEXT:    s_add_i32 s5, s5, 32
+; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT:    s_min_u32 s5, s6, s5
+; GFX1164-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX1164-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX1164-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX1164-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT:    s_min_u32 s4, s4, s8
+; GFX1164-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX1164-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX1164-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
-; GFX1164-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB23_2
-; GFX1164-NEXT:  ; %bb.1:
+; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX1164-NEXT:    s_cbranch_execz .LBB23_4
+; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1164-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1164-NEXT:    s_mov_b32 s3, s7
+; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v4
+; GFX1164-NEXT:    ds_min_rtn_u32 v0, v0, v2
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_gl0_inv
-; GFX1164-NEXT:  .LBB23_2:
-; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT:  .LBB23_4:
+; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_min_u32_e32 v0, s3, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT:    v_min_u32_e32 v0, s2, v1
+; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -6454,53 +5939,43 @@
 ;
 ; GFX1132-LABEL: umin_i32_varying:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    v_mov_b32_e32 v1, -1
-; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_mov_b32_e32 v3, -1
-; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
-; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX1132-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX1132-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX1132-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
-; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
+; GFX1132-NEXT:    ; implicit-def: $vgpr1
+; GFX1132-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX1132-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1132-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT:    s_min_u32 s2, s2, s5
+; GFX1132-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX1132-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX1132-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
-; GFX1132-NEXT:    s_and_saveexec_b32 s3, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB23_2
-; GFX1132-NEXT:  ; %bb.1:
+; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX1132-NEXT:    s_cbranch_execz .LBB23_4
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1132-NEXT:    v_mov_b32_e32 v4, s4
+; GFX1132-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v4
+; GFX1132-NEXT:    ds_min_rtn_u32 v0, v0, v2
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_gl0_inv
-; GFX1132-NEXT:  .LBB23_2:
+; GFX1132-NEXT:  .LBB23_4:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_min_u32_e32 v0, s3, v0
+; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT:    v_min_u32_e32 v0, s2, v1
+; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
@@ -448,313 +448,257 @@
 ;
 ; GFX8-LABEL: add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB2_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
-; GFX8-NEXT:  .LBB2_2:
+; GFX8-NEXT:  .LBB2_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
-; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:  .LBB2_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    buffer_atomic_add v4, off, s[8:11], 0 glc
-; GFX10W64-NEXT:  .LBB2_2:
+; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX10W64-NEXT:  .LBB2_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    buffer_atomic_add v4, off, s[4:7], 0 glc
-; GFX10W32-NEXT:  .LBB2_2:
+; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
+; GFX10W32-NEXT:  .LBB2_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    buffer_atomic_add_u32 v4, off, s[8:11], 0 glc
-; GFX11W64-NEXT:  .LBB2_2:
+; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
+; GFX11W64-NEXT:  .LBB2_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    buffer_atomic_add_u32 v4, off, s[4:7], 0 glc
-; GFX11W32-NEXT:  .LBB2_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
+; GFX11W32-NEXT:  .LBB2_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
@@ -1280,313 +1224,258 @@
 ;
 ; GFX8-LABEL: sub_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v2, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB6_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB6_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX8-NEXT:  .LBB6_2:
+; GFX8-NEXT:  .LBB6_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB6_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB6_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
-; GFX9-NEXT:  .LBB6_2:
+; GFX9-NEXT:  .LBB6_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: sub_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB6_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    buffer_atomic_sub v4, off, s[8:11], 0 glc
-; GFX10W64-NEXT:  .LBB6_2:
+; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX10W64-NEXT:  .LBB6_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: sub_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB6_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    buffer_atomic_sub v4, off, s[4:7], 0 glc
-; GFX10W32-NEXT:  .LBB6_2:
+; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
+; GFX10W32-NEXT:  .LBB6_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB6_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB6_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc
-; GFX11W64-NEXT:  .LBB6_2:
+; GFX11W64-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
+; GFX11W64-NEXT:  .LBB6_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB6_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB6_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc
-; GFX11W32-NEXT:  .LBB6_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
+; GFX11W32-NEXT:  .LBB6_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -amdgpu-atomic-optimizer-use-dpp=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
@@ -463,312 +463,263 @@
 ;
 ; GFX8-LABEL: add_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB2_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB2_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
-; GFX8-NEXT:  .LBB2_2:
+; GFX8-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX8-NEXT:  .LBB2_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB2_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
-; GFX9-NEXT:  .LBB2_2:
+; GFX9-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX9-NEXT:  .LBB2_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: add_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    buffer_atomic_add v4, v0, s[8:11], 0 idxen glc
-; GFX10W64-NEXT:  .LBB2_2:
+; GFX10W64-NEXT:    buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
+; GFX10W64-NEXT:  .LBB2_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    buffer_atomic_add v4, v0, s[4:7], 0 idxen glc
-; GFX10W32-NEXT:  .LBB2_2:
+; GFX10W32-NEXT:    buffer_atomic_add v0, v2, s[4:7], 0 idxen glc
+; GFX10W32-NEXT:  .LBB2_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc
-; GFX11W64-NEXT:  .LBB2_2:
+; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
+; GFX11W64-NEXT:  .LBB2_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_add_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: add_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    buffer_atomic_add_u32 v4, v0, s[4:7], 0 idxen glc
-; GFX11W32-NEXT:  .LBB2_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
+; GFX11W32-NEXT:  .LBB2_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
@@ -1411,312 +1362,264 @@
 ;
 ; GFX8-LABEL: sub_i32_varying_vdata:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX8-NEXT:    v_mov_b32_e32 v1, v0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    s_not_b64 exec, exec
-; GFX8-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX8-NEXT:    s_nop 1
-; GFX8-NEXT:    v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX8-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX8-NEXT:    s_nop 0
-; GFX8-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX8-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX8-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8-NEXT:    s_mov_b32 s4, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX8-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX8-NEXT:    s_add_i32 s5, s5, 32
+; GFX8-NEXT:    s_min_u32 s5, s6, s5
+; GFX8-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX8-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX8-NEXT:    s_mov_b32 m0, s5
+; GFX8-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX8-NEXT:    s_add_i32 s4, s4, s8
+; GFX8-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX8-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB7_2
-; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_cbranch_execz .LBB7_4
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
-; GFX8-NEXT:  .LBB7_2:
+; GFX8-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
+; GFX8-NEXT:  .LBB7_4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX8-NEXT:    v_mov_b32_e32 v0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    flat_store_dword v[3:4], v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_varying_vdata:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    s_not_b64 exec, exec
-; GFX9-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
-; GFX9-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX9-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX9-NEXT:    s_add_i32 s5, s5, 32
+; GFX9-NEXT:    s_min_u32 s5, s6, s5
+; GFX9-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX9-NEXT:    s_mov_b32 m0, s5
+; GFX9-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX9-NEXT:    s_add_i32 s4, s4, s8
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB7_2
-; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz .LBB7_4
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
-; GFX9-NEXT:  .LBB7_2:
+; GFX9-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
+; GFX9-NEXT:  .LBB7_4:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10W64-LABEL: sub_i32_varying_vdata:
 ; GFX10W64:       ; %bb.0: ; %entry
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    s_not_b64 exec, exec
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX10W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX10W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX10W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX10W64-NEXT:    ; implicit-def: $vgpr4
+; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX10W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10W64-NEXT:    s_mov_b32 s4, 0
+; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX10W64-NEXT:    ; implicit-def: $vgpr1
+; GFX10W64-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX10W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W64-NEXT:    s_ff1_i32_b32 s5, s3
+; GFX10W64-NEXT:    s_ff1_i32_b32 s6, s2
+; GFX10W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX10W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX10W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX10W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX10W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX10W64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX10W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX10W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10W64-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX10W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX10W64-NEXT:    ; implicit-def: $vgpr0
 ; GFX10W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
-; GFX10W64-NEXT:  ; %bb.1:
+; GFX10W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10W64-NEXT:    s_cbranch_execz .LBB7_4
+; GFX10W64-NEXT:  ; %bb.3:
 ; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc
-; GFX10W64-NEXT:  .LBB7_2:
+; GFX10W64-NEXT:    buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
+; GFX10W64-NEXT:  .LBB7_4:
 ; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W64-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: sub_i32_varying_vdata:
 ; GFX10W32:       ; %bb.0: ; %entry
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX10W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX10W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX10W32-NEXT:    ; implicit-def: $vgpr4
-; GFX10W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
-; GFX10W32-NEXT:  ; %bb.1:
-; GFX10W32-NEXT:    s_mov_b32 s3, s4
+; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX10W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX10W32-NEXT:    s_mov_b32 s2, 0
+; GFX10W32-NEXT:    ; implicit-def: $vgpr1
+; GFX10W32-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX10W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10W32-NEXT:    s_ff1_i32_b32 s4, s3
+; GFX10W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX10W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX10W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX10W32-NEXT:    s_andn2_b32 s3, s3, s6
+; GFX10W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX10W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX10W32-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX10W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX10W32-NEXT:    ; implicit-def: $vgpr0
+; GFX10W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX10W32-NEXT:    s_cbranch_execz .LBB7_4
+; GFX10W32-NEXT:  ; %bb.3:
 ; GFX10W32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    buffer_atomic_sub v4, v0, s[4:7], 0 idxen glc
-; GFX10W32-NEXT:  .LBB7_2:
+; GFX10W32-NEXT:    buffer_atomic_sub v0, v2, s[4:7], 0 idxen glc
+; GFX10W32-NEXT:  .LBB7_4:
 ; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX10W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX10W32-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v0, v4, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: sub_i32_varying_vdata:
 ; GFX11W64:       ; %bb.0: ; %entry
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_not_b64 exec, exec
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
-; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 15
-; GFX11W64-NEXT:    v_readlane_b32 s5, v1, 31
-; GFX11W64-NEXT:    v_writelane_b32 v3, s4, 16
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 63
-; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 47
-; GFX11W64-NEXT:    v_writelane_b32 v3, s5, 32
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v0
-; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
-; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 48
-; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GFX11W64-NEXT:    ; implicit-def: $vgpr4
-; GFX11W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX11W64-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11W64-NEXT:  ; %bb.1:
+; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    s_mov_b32 s4, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v1
+; GFX11W64-NEXT:    ; implicit-def: $vgpr1
+; GFX11W64-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX11W64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W64-NEXT:    s_ctz_i32_b32 s5, s3
+; GFX11W64-NEXT:    s_ctz_i32_b32 s6, s2
+; GFX11W64-NEXT:    s_add_i32 s5, s5, 32
+; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11W64-NEXT:    s_min_u32 s5, s6, s5
+; GFX11W64-NEXT:    v_readlane_b32 s8, v0, s5
+; GFX11W64-NEXT:    s_lshl_b64 s[6:7], 1, s5
+; GFX11W64-NEXT:    v_writelane_b32 v1, s4, s5
+; GFX11W64-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[6:7]
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    s_add_i32 s4, s4, s8
+; GFX11W64-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11W64-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX11W64-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11W64-NEXT:    ; implicit-def: $vgpr0
+; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX11W64-NEXT:    s_cbranch_execz .LBB7_4
+; GFX11W64-NEXT:  ; %bb.3:
 ; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, s4
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc
-; GFX11W64-NEXT:  .LBB7_2:
+; GFX11W64-NEXT:    buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc
+; GFX11W64-NEXT:  .LBB7_4:
 ; GFX11W64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W64-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W64-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
 ;
 ; GFX11W32-LABEL: sub_i32_varying_vdata:
 ; GFX11W32:       ; %bb.0: ; %entry
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
-; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s4, v1, 31
-; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
-; GFX11W32-NEXT:    v_readlane_b32 s3, v1, 15
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
-; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
-; GFX11W32-NEXT:    v_writelane_b32 v3, s3, 16
-; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
-; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11W32-NEXT:    ; implicit-def: $vgpr4
-; GFX11W32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX11W32-NEXT:    s_cbranch_execz .LBB7_2
-; GFX11W32-NEXT:  ; %bb.1:
-; GFX11W32-NEXT:    s_mov_b32 s3, s4
+; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    s_mov_b32 s2, 0
+; GFX11W32-NEXT:    ; implicit-def: $vgpr1
+; GFX11W32-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX11W32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11W32-NEXT:    s_ctz_i32_b32 s4, s3
+; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11W32-NEXT:    v_readlane_b32 s5, v0, s4
+; GFX11W32-NEXT:    s_lshl_b32 s6, 1, s4
+; GFX11W32-NEXT:    v_writelane_b32 v1, s2, s4
+; GFX11W32-NEXT:    s_and_not1_b32 s3, s3, s6
+; GFX11W32-NEXT:    s_add_i32 s2, s2, s5
+; GFX11W32-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX11W32-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX11W32-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX11W32-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11W32-NEXT:    ; implicit-def: $vgpr0
+; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX11W32-NEXT:    s_xor_b32 s3, exec_lo, s3
+; GFX11W32-NEXT:    s_cbranch_execz .LBB7_4
+; GFX11W32-NEXT:  ; %bb.3:
 ; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, s3
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    buffer_atomic_sub_u32 v4, v0, s[4:7], 0 idxen glc
-; GFX11W32-NEXT:  .LBB7_2:
-; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11W32-NEXT:    buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc
+; GFX11W32-NEXT:  .LBB7_4:
+; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX11W32-NEXT:    v_mov_b32_e32 v4, v3
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT:    v_sub_nc_u32_e32 v4, s2, v4
+; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11W32-NEXT:    v_sub_nc_u32_e32 v1, s2, v1
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v0, v4, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -380,4 +380,4 @@
 attributes #1 = { nounwind writeonly }
 attributes #2 = { nounwind }
 
-!0 = !{}
+!0 = !{}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -1376,4 +1376,4 @@
 
 define void @empty() {
   ret void
-}
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
--- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll
@@ -603,4 +603,4 @@
 }
 
 declare void @llvm.amdgcn.s.barrier()
-declare void @llvm.amdgcn.wave.barrier()
+declare void @llvm.amdgcn.wave.barrier()
\ No newline at end of file