diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1573,6 +1573,11 @@
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
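+    // Direct the cluster edge from the node that comes first in the original
+    // instruction order (lower NodeNum) to the later one, so that clustering
+    // reorders the mem ops as little as possible.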
+    if (SUa->NodeNum > SUb->NodeNum)
+      std::swap(SUa, SUb);
     if (TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
                                  *MemOpRecords[Idx + 1].BaseOp,
                                  ClusterLength) &&
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -3,7 +3,7 @@
 
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i64_scale:%bb.0
-; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:Cluster ld/st SU(2) - SU(5)
 ; CHECK:SU(4):   STRXui %1:gpr64, %0:gpr64common, 1
 ; CHECK:SU(3):   STRXui %1:gpr64, %0:gpr64common, 2
@@ -24,7 +24,7 @@
 
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i32_scale:%bb.0
-; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:Cluster ld/st SU(2) - SU(5)
 ; CHECK:SU(4):   STRWui %1:gpr32, %0:gpr64common, 1
 ; CHECK:SU(3):   STRWui %1:gpr32, %0:gpr64common, 2
@@ -45,12 +45,12 @@
 
 ; CHECK:********** MI Scheduling **********
 ; CHECK-LABEL:stp_i64_unscale:%bb.0 entry
-; CHECK:Cluster ld/st SU(5) - SU(2)
-; CHECK:Cluster ld/st SU(4) - SU(3)
-; CHECK:SU(5):   STURXi %1:gpr64, %0:gpr64common, -32
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:SU(2):   STURXi %1:gpr64, %0:gpr64common, -24
-; CHECK:SU(4):   STURXi %1:gpr64, %0:gpr64common, -16
 ; CHECK:SU(3):   STURXi %1:gpr64, %0:gpr64common, -8
+; CHECK:SU(4):   STURXi %1:gpr64, %0:gpr64common, -16
+; CHECK:SU(5):   STURXi %1:gpr64, %0:gpr64common, -32
 define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
 entry:
   %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
@@ -66,12 +66,12 @@
 
 ; CHECK:********** MI Scheduling **********
 ; CHECK-LABEL:stp_i32_unscale:%bb.0 entry
-; CHECK:Cluster ld/st SU(5) - SU(2)
-; CHECK:Cluster ld/st SU(4) - SU(3)
-; CHECK:SU(5):   STURWi %1:gpr32, %0:gpr64common, -16
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:SU(2):   STURWi %1:gpr32, %0:gpr64common, -12
-; CHECK:SU(4):   STURWi %1:gpr32, %0:gpr64common, -8
 ; CHECK:SU(3):   STURWi %1:gpr32, %0:gpr64common, -4
+; CHECK:SU(4):   STURWi %1:gpr32, %0:gpr64common, -8
+; CHECK:SU(5):   STURWi %1:gpr32, %0:gpr64common, -16
 define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
 entry:
   %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -36,7 +36,7 @@
 ; Test ldur clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldur_int:%bb.0
-; CHECK: Cluster ld/st SU(2) - SU(1)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %{{[0-9]+}}:gpr32 = LDURWi
 ; CHECK: SU(2):   %{{[0-9]+}}:gpr32 = LDURWi
 define i32 @ldur_int(i32* %a) nounwind {
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
--- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -259,9 +259,9 @@
 define void @memset_16_stack() {
 ; CHECK-LABEL: memset_16_stack:
 ; CHECK:       mov x8, #-6148914691236517206
-; CHECK-NEXT:  str x8, [sp, #-32]!
 ; CHECK-NEXT:  mov x0, sp
 ; CHECK-NEXT:  stp x8, x30, [sp, #8]
+; CHECK-NEXT:  str x8, [sp]
 ; CHECK-NEXT:  bl something
   %buf = alloca [16 x i8], align 1
   %cast = bitcast [16 x i8]* %buf to i8*
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -6,17 +6,17 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    fmov s0, wzr
-; CHECK-NEXT:    ldp x10, x9, [sp, #8]
 ; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ldp x8, x9, [sp, #8]
+; CHECK-NEXT:    ldr x10, [sp]
 ; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    fmov w11, s0
 ; CHECK-NEXT:    tst w11, #0x1
 ; CHECK-NEXT:    csel x11, x2, x6, ne
 ; CHECK-NEXT:    csel x12, x3, x7, ne
-; CHECK-NEXT:    csel x8, x4, x8, ne
-; CHECK-NEXT:    csel x10, x5, x10, ne
-; CHECK-NEXT:    stp x8, x10, [x9, #16]
+; CHECK-NEXT:    csel x10, x4, x10, ne
+; CHECK-NEXT:    csel x8, x5, x8, ne
+; CHECK-NEXT:    stp x10, x8, [x9, #16]
 ; CHECK-NEXT:    stp x11, x12, [x9]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
diff --git a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
--- a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -65,8 +65,8 @@
 define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-NEXT: adrp x8, [[SET3]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF
-; CHECK-NEXT: stp w2, w0, [x8]
-; CHECK-NEXT: str w1, [x8, #8]
+; CHECK-NEXT: stp w0, w1, [x8, #4]
+; CHECK-NEXT: str w2, [x8]
 ; CHECK-NEXT: ret
   store i32 %a1, i32* @m4, align 4
   store i32 %a2, i32* @n4, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-scheduler.mir b/llvm/test/CodeGen/AArch64/machine-scheduler.mir
--- a/llvm/test/CodeGen/AArch64/machine-scheduler.mir
+++ b/llvm/test/CodeGen/AArch64/machine-scheduler.mir
@@ -18,8 +18,8 @@
 ---
 # CHECK-LABEL: name: load_imp-def
 # CHECK: bb.0.entry:
-# CHECK: LDRWui $x0, 0
 # CHECK: LDRWui $x0, 1
+# CHECK: LDRWui $x0, 0
 # CHECK: STRWui $w1, $x0, 2
 name: load_imp-def
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
--- a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
-; GCN: global_store_dwordx2
-; GCN: global_store_dword v
 ; GCN: global_store_short
+; GCN: global_store_dword v
+; GCN: global_store_dwordx2
 define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(i8 addrspace(1)* nocapture %data) {
 entry:
   store i8 72, i8 addrspace(1)* %data, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -133,10 +133,10 @@
 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 
 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
@@ -331,10 +331,10 @@
 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -765,16 +765,17 @@
 
 ; GCN-LABEL: {{^}}tail_call_byval_align16:
 ; GCN-NOT: s32
-; GCN-NOT: buffer_store_dword v33
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NOT: buffer_store_dword v33
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16
 
 ; GCN: s_getpc_b64
 
 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -199,15 +199,14 @@
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
-; GCN-NEXT:    global_load_dword v4, v[2:3], off
-; GCN-NEXT:    global_load_ushort v2, v[2:3], off offset:4
+; GCN-NEXT:    global_load_ushort v4, v[2:3], off offset:4
+; GCN-NEXT:    global_load_dword v2, v[2:3], off
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    buffer_store_short v4, off, s[0:3], s9 offset:4
-; GCN-NEXT:    buffer_store_short_d16_hi v4, off, s[0:3], s9 offset:6
-; GCN-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NEXT:    buffer_store_short v2, off, s[0:3], s9 offset:8
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v2, off, s[0:3], s9 offset:4
+; GCN-NEXT:    buffer_store_short_d16_hi v2, off, s[0:3], s9 offset:6
+; GCN-NEXT:    buffer_store_short v4, off, s[0:3], s9 offset:8
 ; GCN-NEXT:    buffer_load_ushort v2, off, s[0:3], s9 offset:4
 ; GCN-NEXT:    buffer_load_ushort v4, off, s[0:3], s9 offset:6
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -236,19 +236,21 @@
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    flat_load_ubyte v1, v[6:7]
-; VI-NEXT:    flat_load_ubyte v4, v[4:5]
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
+; VI-NEXT:    flat_load_ubyte v3, v[4:5]
+; VI-NEXT:    flat_load_ubyte v4, v[6:7]
+; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    v_or_b32_e32 v1, v1, v3
+; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_e32 v2, v2, v4
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -414,35 +416,34 @@
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT:    flat_load_ubyte v9, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v10, v[4:5]
-; VI-NEXT:    flat_load_ubyte v11, v[2:3]
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 5, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, 4, v0
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v8, vcc, 6, v0
-; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    flat_load_ubyte v1, v[8:9]
-; VI-NEXT:    flat_load_ubyte v7, v[6:7]
-; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v8, v[0:1]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 6, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
+; VI-NEXT:    flat_load_ubyte v3, v[4:5]
+; VI-NEXT:    flat_load_ubyte v4, v[6:7]
+; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v9
 ; VI-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v11
-; VI-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT:    v_or_b32_e32 v4, v3, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v8
 ; VI-NEXT:    v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
-; VI-NEXT:    v_or_b32_e32 v4, v4, v7
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v0
 ; VI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v4
@@ -699,23 +700,24 @@
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
-; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v0, v[0:1]
-; VI-NEXT:    flat_load_ubyte v1, v[6:7]
-; VI-NEXT:    flat_load_ubyte v4, v[4:5]
+; VI-NEXT:    flat_load_ubyte v6, v[0:1]
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_ubyte v2, v[2:3]
+; VI-NEXT:    flat_load_ubyte v3, v[4:5]
+; VI-NEXT:    flat_load_ubyte v0, v[0:1]
+; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT:    v_or_b32_e32 v1, v1, v6
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_or_b32_e32 v4, v2, v0
-; VI-NEXT:    v_or_b32_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v0, v0, v4
+; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
+; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v1
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -473,8 +473,8 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -231,10 +231,10 @@
 
 ; GCN-LABEL: {{^}}undefined_stack_store_reg:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
 define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
 bb:
   %tmp = alloca <4 x float>, align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
--- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
@@ -4,8 +4,8 @@
 ; an unused stack slot, causing ScratchSize to be non-zero.
 
 ; GCN-LABEL: store_v3i32:
-; GCN:        ds_read_b64
 ; GCN:        ds_read_b32
+; GCN:        ds_read_b64
 ; GCN:        ds_write_b32
 ; GCN:        ds_write_b64
 ; GCN: ScratchSize: 0
@@ -17,8 +17,8 @@
 }
 
 ; GCN-LABEL: store_v5i32:
-; GCN:        ds_read2_b64
 ; GCN:        ds_read_b32
+; GCN:        ds_read2_b64
 ; GCN:        ds_write_b32
 ; GCN:        ds_write2_b64
 ; GCN: ScratchSize: 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1642,10 +1642,10 @@
 ; SI-NEXT:    v_mov_b32_e32 v13, s21
 ; SI-NEXT:    v_mov_b32_e32 v14, s22
 ; SI-NEXT:    v_mov_b32_e32 v15, s23
-; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
 ; SI-NEXT:    v_or_b32_e32 v16, s4, v16
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
@@ -1688,10 +1688,10 @@
 ; VI-NEXT:    v_mov_b32_e32 v13, s21
 ; VI-NEXT:    v_mov_b32_e32 v14, s22
 ; VI-NEXT:    v_mov_b32_e32 v15, s23
-; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
 ; VI-NEXT:    v_or_b32_e32 v16, s4, v16
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -855,8 +855,8 @@
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -75,8 +75,8 @@
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -137,22 +137,22 @@
 ; VI-NEXT:    v_mov_b32_e32 v7, s5
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
 ; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT:    flat_load_dword v8, v[0:1]
-; VI-NEXT:    flat_load_ushort v9, v[4:5]
+; VI-NEXT:    flat_load_ushort v8, v[4:5]
+; VI-NEXT:    flat_load_dword v9, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT:    flat_load_dword v2, v[2:3]
 ; VI-NEXT:    flat_load_ushort v0, v[0:1]
+; VI-NEXT:    flat_load_dword v1, v[2:3]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 4, v6
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_max_i16_e32 v1, v8, v2
-; VI-NEXT:    v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT:    v_or_b32_e32 v1, v1, v2
+; VI-NEXT:    v_max_i16_e32 v0, v8, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_max_i16_e32 v0, v9, v0
-; VI-NEXT:    flat_store_dword v[6:7], v1
+; VI-NEXT:    v_max_i16_e32 v2, v9, v1
+; VI-NEXT:    v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    flat_store_short v[4:5], v0
+; VI-NEXT:    flat_store_dword v[6:7], v1
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sge_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -221,26 +221,26 @@
 ; GFX8:    flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ;
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 entry:
    %call = tail call i64 @_Z13get_global_idj(i32 0)
    %conv = and i64 %call, 255
@@ -299,10 +299,10 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@@ -352,8 +352,8 @@
 ;
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -454,13 +454,13 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -519,8 +519,8 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -509,8 +509,8 @@
 ; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = shl <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -579,8 +579,8 @@
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = lshr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -653,8 +653,8 @@
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v10, 16
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT:    s_endpgm
   %shift = ashr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -10,21 +10,20 @@
 ; GCN-NEXT:  BB0_1: ; %bb0
 ; GCN-NEXT:    ; =>This Loop Header: Depth=1
 ; GCN-NEXT:    ; Child Loop BB0_2 Depth 2
-; GCN-NEXT:    v_add_co_u32_e64 v4, vcc_lo, v0, 8
+; GCN-NEXT:    v_add_co_u32_e64 v2, vcc_lo, v0, 8
 ; GCN-NEXT:    s_mov_b32 s5, exec_lo
-; GCN-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GCN-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
-; GCN-NEXT:    flat_load_dwordx2 v[4:5], v[4:5]
+; GCN-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GCN-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
+; GCN-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN-NEXT:  BB0_2: ; Parent Loop BB0_1 Depth=1
 ; GCN-NEXT:    ; => This Inner Loop Header: Depth=2
-; GCN-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; GCN-NEXT:    v_readfirstlane_b32 s8, v2
-; GCN-NEXT:    v_readfirstlane_b32 s9, v3
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_readfirstlane_b32 s10, v4
-; GCN-NEXT:    v_readfirstlane_b32 s11, v5
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3]
-; GCN-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[4:5]
+; GCN-NEXT:    v_readfirstlane_b32 s8, v4
+; GCN-NEXT:    v_readfirstlane_b32 s9, v5
+; GCN-NEXT:    v_readfirstlane_b32 s10, v2
+; GCN-NEXT:    v_readfirstlane_b32 s11, v3
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GCN-NEXT:    v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GCN-NEXT:    s_and_b32 s4, vcc_lo, s4
 ; GCN-NEXT:    s_and_saveexec_b32 s4, s4
 ; GCN-NEXT:    s_nop 0