diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -186,6 +186,10 @@
       return true;
     }
 
+    bool pairedHasLowerOffset() {
+      return Offset1 < Offset0;
+    }
+
     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
                const GCNSubtarget &STM);
     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
@@ -214,7 +218,7 @@
   AliasAnalysis *AA = nullptr;
   bool OptimizeAgain;
 
-  static bool offsetsCanBeCombined(CombineInfo &CI);
+  static bool offsetsCanBeCombined(CombineInfo &CI, bool Modify = false);
   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
   static unsigned getNewOpcode(const CombineInfo &CI);
   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
@@ -224,14 +228,19 @@
 
   unsigned read2Opcode(unsigned EltSize) const;
   unsigned read2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
+                                             std::list<CombineInfo> &MergeList);
 
   unsigned write2Opcode(unsigned EltSize) const;
   unsigned write2ST64Opcode(unsigned EltSize) const;
-  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
-  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
-  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
-  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
+  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI,
+                                              std::list<CombineInfo> &MergeList);
+  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI,
+                                                      std::list<CombineInfo> &MergeList);
+  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI,
+                                                  std::list<CombineInfo> &MergeList);
+  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI,
+                                                   std::list<CombineInfo> &MergeList);
 
   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                            int32_t NewOffset) const;
@@ -249,6 +258,10 @@
                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
   bool collectMergeableInsts(MachineBasicBlock &MBB,
                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
+  bool isInOrder(MachineBasicBlock::const_iterator First,
+                 MachineBasicBlock::const_iterator Second) const;
+  void order(CombineInfo &CI) const;
+  bool isValid(CombineInfo &CI) const;
 
 public:
   static char ID;
@@ -257,8 +270,9 @@
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  void removeCombinedInst(std::list<CombineInfo> &MergeList,
-                                         const MachineInstr &MI);
+  void replaceInstsInList(std::list<CombineInfo> &MergeList,
+                                             const CombineInfo &CI,
+                                             MachineBasicBlock::iterator NewMI);
   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                      bool &OptimizeListAgain);
   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
@@ -464,7 +478,14 @@
 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
                                                   const SIInstrInfo &TII) {
   Paired = MI;
-  assert(InstClass == getInstClass(Paired->getOpcode(), TII));
+
+  // We can't pair these if we can't determine the instruction
+  // class.
+  if (InstClass == UNKNOWN || getInstClass(MI->getOpcode(), TII) != InstClass) {
+    InstClass = UNKNOWN;
+    return;
+  }
+
   int OffsetIdx =
       AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
   Offset1 = Paired->getOperand(OffsetIdx).getImm();
@@ -586,7 +607,7 @@
   return MMO;
 }
 
-bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
+bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, bool Modify) {
   // XXX - Would the same offset be OK? Is there any reason this would happen or
   // be useful?
   if (CI.Offset0 == CI.Offset1)
@@ -613,16 +634,20 @@
   // the stride 64 versions.
   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
-    CI.Offset0 = EltOffset0 / 64;
-    CI.Offset1 = EltOffset1 / 64;
-    CI.UseST64 = true;
+    if (Modify) {
+      CI.Offset0 = EltOffset0 / 64;
+      CI.Offset1 = EltOffset1 / 64;
+      CI.UseST64 = true;
+    }
     return true;
   }
 
   // Check if the new offsets fit in the reduced 8-bit range.
   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
-    CI.Offset0 = EltOffset0;
-    CI.Offset1 = EltOffset1;
+    if (Modify) {
+      CI.Offset0 = EltOffset0;
+      CI.Offset1 = EltOffset1;
+    }
     return true;
   }
 
@@ -631,15 +656,19 @@
   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
 
   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
-    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
-    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
-    CI.UseST64 = true;
+    if (Modify) {
+      CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+      CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+      CI.UseST64 = true;
+    }
     return true;
   }
 
   if (isUInt<8>(OffsetDiff)) {
-    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
-    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
+    if (Modify) {
+      CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
+      CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
+    }
     return true;
   }
 
@@ -664,8 +693,7 @@
 }
 
 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
-  MachineBasicBlock *MBB = CI.I->getParent();
-  MachineBasicBlock::iterator E = MBB->end();
+  MachineBasicBlock::iterator E = std::next(CI.Paired);
   MachineBasicBlock::iterator MBBI = CI.I;
 
   const unsigned Opc = CI.I->getOpcode();
@@ -731,18 +759,18 @@
                               CI.InstsToMove))
       continue;
 
-    bool Match = CI.hasSameBaseAddress(*MBBI);
-
-    if (Match) {
-      CI.setPaired(MBBI, *TII);
-
-      // Check both offsets fit in the reduced range.
-      // We also need to go through the list of instructions that we plan to
+    if (&*MBBI == &*CI.Paired) {
+      // We need to go through the list of instructions that we plan to
       // move and make sure they are all safe to move down past the merged
       // instruction.
-      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
-        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
-          return true;
+      if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA)) {
+        // Call offsetsCanBeCombined with modify = true so that the offsets are
+        // correct for the new instruction.  This should return true, because
+        // this function should only be called on CombineInfo objects that
+        // have already been confirmed to be mergeable.
+        offsetsCanBeCombined(CI, true);
+        return true;
+      }
     }
 
     // We've found a load/store that we couldn't merge for some reason.
@@ -772,7 +800,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI,
+                                     std::list<CombineInfo> &MergeList) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -847,6 +876,8 @@
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
+  replaceInstsInList(MergeList, CI, Read2);
+
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
@@ -871,7 +902,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI,
+                                      std::list<CombineInfo> &MergeList) {
   MachineBasicBlock *MBB = CI.I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -930,6 +962,8 @@
 
   moveInstsAfter(Write2, CI.InstsToMove);
 
+  replaceInstsInList(MergeList, CI, Write2);
+
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
 
@@ -938,7 +972,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI,
+                                              std::list<CombineInfo> &MergeList) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
   const unsigned Opcode = getNewOpcode(CI);
@@ -982,13 +1017,16 @@
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
+  replaceInstsInList(MergeList, CI, New);
+
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
   return New;
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI,
+                                          std::list<CombineInfo> &MergeList) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1043,6 +1081,8 @@
 
   moveInstsAfter(Copy1, CI.InstsToMove);
 
+  replaceInstsInList(MergeList, CI, New);
+
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
   return New;
@@ -1167,7 +1207,8 @@
 }
 
 MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
+SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI,
+                                           std::list<CombineInfo> &MergeList) {
   MachineBasicBlock *MBB = CI.I->getParent();
   DebugLoc DL = CI.I->getDebugLoc();
 
@@ -1219,6 +1260,8 @@
 
   moveInstsAfter(MIB, CI.InstsToMove);
 
+  replaceInstsInList(MergeList, CI, New);
+
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
   return New;
@@ -1547,6 +1590,31 @@
   MergeableInsts.emplace_back(1, CI);
 }
 
+bool SILoadStoreOptimizer::isInOrder(MachineBasicBlock::const_iterator First,
+                                     MachineBasicBlock::const_iterator Second) const {
+  // FIXME: Is there a better way to do this.
+  const MachineBasicBlock *MBB = First->getParent();
+  for (auto I = First, E = MBB->end(); I != E; ++I) {
+    if (I == Second)
+      return true;
+  }
+  return false;
+
+}
+
+void SILoadStoreOptimizer::order(CombineInfo &CI) const {
+  if (!isInOrder(CI.I, CI.Paired)) {
+    auto OldMI = CI.I;
+    CI.setMI(CI.Paired, *TII, *STM);
+    CI.setPaired(OldMI, *TII);
+  }
+}
+
+bool SILoadStoreOptimizer::isValid(CombineInfo &CI) const {
+  return CI.InstClass != UNKNOWN && widthsFit(*STM, CI) &&
+         offsetsCanBeCombined(CI);
+}
+
 bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
   bool Modified = false;
@@ -1579,6 +1647,62 @@
 
     addInstToMergeableList(CI, MergeableInsts);
   }
+
+  // At this point we have lists of Mergeable instructions.  In each list
+  // there is a CombineInfo object which has it's I member set, but nothing
+  // set of Paired yet.
+  //
+  //
+  // Part 2: Sort lists by offset and then for each CombineInfo object in the
+  // list try to find an instruction that can be merged with I.  If an instruction
+  // is found, it is stored in the Paired field.  If no instructions are found, then
+  // the CombineInfo object is deleted from the list.
+  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+
+    if (MergeList.size() <= 1) {
+      // This means we have found only one instruction with a given address
+      // that can be merged, and we need at least 2 instructions to do a merge,
+      // so this list can be discarded.
+      // The code below assumes lists have at least 2 items.
+      MergeList.clear();
+      continue;
+    }
+
+    // Sort the lists by offsets, this way mergeable instructions will be
+    // adjacent to each other in the list, which will make it easier to find
+    // matches.
+    MergeList.sort(
+        [] (const CombineInfo &A, CombineInfo &B) {
+          return A.Offset0 < B.Offset0;
+        });
+
+    for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
+         Next = std::next(I)) {
+      CombineInfo &CI = *I;
+
+      // Check that the next instruction in the list can be paired with this
+      // one.  Since the list is sorted by offset, if this instruction can't
+      // be paired with the next one in the list, then it can't be paired at
+      // all.
+      CI.setPaired(Next->I, *TII);
+      if (!widthsFit(*STM, CI) || !offsetsCanBeCombined(CI)) {
+        I = MergeList.erase(I);
+        continue;
+      }
+
+      // Offsets can be combined.  If the Paired instruction comes first in the
+      // order() will swap I and Paired, because the optimizeBlock() function
+      // only looks forward for matches.
+      order(CI);
+
+      I = Next;
+    }
+
+    // The last instruction in each list can be deleted, since it will have
+    // an empty paired value.
+    MergeList.pop_back();
+  }
+
   return Modified;
 }
 
@@ -1590,7 +1714,7 @@
   bool Modified = false;
 
   for (std::list<CombineInfo> &MergeList : MergeableInsts) {
-    if (MergeList.size() < 2)
+    if (MergeList.empty())
       continue;
 
     bool OptimizeListAgain = false;
@@ -1613,14 +1737,26 @@
   return Modified;
 }
 
+// FIXME: This doesn't have to be a loop.
 void
-SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
-                                         const MachineInstr &MI) {
+SILoadStoreOptimizer::replaceInstsInList(std::list<CombineInfo> &MergeList,
+                                             const CombineInfo &CI,
+                                             MachineBasicBlock::iterator NewMI) {
+  for (auto I = MergeList.begin(), E = MergeList.end(); I != E; ++I) {
+    // Don't need to update CI, because this will be deleted.
+    if (&*I == &CI)
+      continue;
 
-  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
-    if (&*CI->I == &MI) {
-      MergeList.erase(CI);
-      return;
+    if (&*I->I == &*CI.I || &*I->I == &*CI.Paired) {
+      I->setMI(NewMI, *TII, *STM);
+      order(*I);
+    }
+
+    if (&*I->Paired == &*CI.I || &*I->Paired == &*CI.Paired) {
+      I->setPaired(NewMI, *TII);
+      // Reset paired to re-calculate the
+      // FIXME: Do we need to reset Paired here?
+      order(*I);
     }
   }
 }
@@ -1630,35 +1766,40 @@
                                           std::list<CombineInfo> &MergeList,
                                           bool &OptimizeListAgain) {
   bool Modified = false;
-  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+  for (auto I = MergeList.begin(); I != MergeList.end();) {
     CombineInfo &CI = *I;
 
+    if (!isValid(CI)) {
+      ++I;
+      continue;
+    }
+
     switch (CI.InstClass) {
     default:
       break;
     case DS_READ:
       if (findMatchingInst(CI)) {
         Modified = true;
-        removeCombinedInst(MergeList, *CI.Paired);
-        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
-        CI.setMI(NewMI, *TII, *STM);
+        mergeRead2Pair(CI, MergeList);
+        I = MergeList.erase(I);
+        continue;
       }
       break;
     case DS_WRITE:
       if (findMatchingInst(CI)) {
         Modified = true;
-        removeCombinedInst(MergeList, *CI.Paired);
-        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
-        CI.setMI(NewMI, *TII, *STM);
+        mergeWrite2Pair(CI, MergeList);
+        I = MergeList.erase(I);
+        continue;
       }
       break;
     case S_BUFFER_LOAD_IMM:
       if (findMatchingInst(CI)) {
         Modified = true;
-        removeCombinedInst(MergeList, *CI.Paired);
-        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
-        CI.setMI(NewMI, *TII, *STM);
+        mergeSBufferLoadImmPair(CI, MergeList);
         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
+        I = MergeList.erase(I);
+        continue;
       }
       break;
     case BUFFER_LOAD_OFFEN:
@@ -1667,10 +1808,10 @@
     case BUFFER_LOAD_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        removeCombinedInst(MergeList, *CI.Paired);
-        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
-        CI.setMI(NewMI, *TII, *STM);
+        mergeBufferLoadPair(CI, MergeList);
         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+        I = MergeList.erase(I);
+        continue;
       }
       break;
     case BUFFER_STORE_OFFEN:
@@ -1679,17 +1820,14 @@
     case BUFFER_STORE_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        removeCombinedInst(MergeList, *CI.Paired);
-        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
-        CI.setMI(NewMI, *TII, *STM);
+        mergeBufferStorePair(CI, MergeList);
         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+        I = MergeList.erase(I);
+        continue;
       }
       break;
     }
-    // Clear the InstsToMove after we have finished searching so we don't have
-    // stale values left over if we search for this CI again in another pass
-    // over the block.
-    CI.InstsToMove.clear();
+    ++I;
   }
 
   return Modified;
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -6,9 +6,9 @@
 ; offset0 is larger than offset1
 
 ; SI-LABEL: {{^}}offset_order:
-; SI-DAG: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}}
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}}
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
 define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store.mir
@@ -66,6 +66,7 @@
   attributes #0 = { convergent nounwind }
 
   define amdgpu_kernel void @merge_mmos(i32 addrspace(1)* %ptr_addr1) { ret void }
+  define amdgpu_kernel void @reorder_offsets(i32 addrspace(1)* %reorder_addr1) { ret void }
 
 ...
 ---
@@ -194,3 +195,26 @@
     S_ENDPGM 0
 
 ...
+---
+# CHECK-LABEL: reorder_offsets
+# CHECK-DAG: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.reorder_addr1 + 16, align 4, addrspace 1)
+# CHECK-DAG: BUFFER_STORE_DWORDX4_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into %ir.reorder_addr1, align 4, addrspace 1)
+
+name: reorder_offsets
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    %0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 8, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 8)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 12, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 12)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 16, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 16)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 20, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1 + 20)
+    BUFFER_STORE_DWORD_OFFSET_exact %1, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.reorder_addr1)
+    S_ENDPGM 0
+
+
+...