diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1573,6 +1573,10 @@
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
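+    // Make SUa the node with the lower NodeNum so the cluster edge is always
+    // added from the lower-numbered node to the higher-numbered one.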
+    if (SUa->NodeNum > SUb->NodeNum)
+      std::swap(SUa, SUb);
     if (TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
                                  *MemOpRecords[Idx + 1].BaseOp,
                                  ClusterLength) &&
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -3,7 +3,7 @@
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i64_scale:%bb.0
-; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:Cluster ld/st SU(2) - SU(5)
 ; CHECK:SU(4): STRXui %1:gpr64, %0:gpr64common, 1
 ; CHECK:SU(3): STRXui %1:gpr64, %0:gpr64common, 2
@@ -24,7 +24,7 @@
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i32_scale:%bb.0
-; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:Cluster ld/st SU(2) - SU(5)
 ; CHECK:SU(4): STRWui %1:gpr32, %0:gpr64common, 1
 ; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 2
@@ -45,12 +45,12 @@
 ; CHECK:********** MI Scheduling **********
 ; CHECK-LABEL:stp_i64_unscale:%bb.0 entry
-; CHECK:Cluster ld/st SU(5) - SU(2)
-; CHECK:Cluster ld/st SU(4) - SU(3)
-; CHECK:SU(5): STURXi %1:gpr64, %0:gpr64common, -32
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:SU(2): STURXi %1:gpr64, %0:gpr64common, -24
-; CHECK:SU(4): STURXi %1:gpr64, %0:gpr64common, -16
 ; CHECK:SU(3): STURXi %1:gpr64, %0:gpr64common, -8
+; CHECK:SU(4): STURXi %1:gpr64, %0:gpr64common, -16
+; CHECK:SU(5): STURXi %1:gpr64, %0:gpr64common, -32
 define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
 entry:
   %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
@@ -66,12 +66,12 @@
 ; CHECK:********** MI Scheduling **********
 ; CHECK-LABEL:stp_i32_unscale:%bb.0 entry
-; CHECK:Cluster ld/st SU(5) - SU(2)
-; CHECK:Cluster ld/st SU(4) - SU(3)
-; CHECK:SU(5): STURWi %1:gpr32, %0:gpr64common, -16
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:Cluster ld/st SU(3) - SU(4)
 ; CHECK:SU(2): STURWi %1:gpr32, %0:gpr64common, -12
-; CHECK:SU(4): STURWi %1:gpr32, %0:gpr64common, -8
 ; CHECK:SU(3): STURWi %1:gpr32, %0:gpr64common, -4
+; CHECK:SU(4): STURWi %1:gpr32, %0:gpr64common, -8
+; CHECK:SU(5): STURWi %1:gpr32, %0:gpr64common, -16
 define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
 entry:
   %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -36,7 +36,7 @@
 ; Test ldur clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldur_int:%bb.0
-; CHECK: Cluster ld/st SU(2) - SU(1)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDURWi
 ; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDURWi
 define i32 @ldur_int(i32* %a) nounwind {
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
--- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -259,9 +259,9 @@
 define void @memset_16_stack() {
 ; CHECK-LABEL: memset_16_stack:
 ; CHECK: mov x8, #-6148914691236517206
-; CHECK-NEXT: str x8, [sp, #-32]!
 ; CHECK-NEXT: mov x0, sp
 ; CHECK-NEXT: stp x8, x30, [sp, #8]
+; CHECK-NEXT: str x8, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [16 x i8], align 1
   %cast = bitcast [16 x i8]* %buf to i8*
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -6,17 +6,17 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: and w8, w0, #0x1
 ; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: ldp x10, x9, [sp, #8]
 ; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldr x8, [sp]
+; CHECK-NEXT: ldp x8, x9, [sp, #8]
+; CHECK-NEXT: ldr x10, [sp]
 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: fmov w11, s0
 ; CHECK-NEXT: tst w11, #0x1
 ; CHECK-NEXT: csel x11, x2, x6, ne
 ; CHECK-NEXT: csel x12, x3, x7, ne
-; CHECK-NEXT: csel x8, x4, x8, ne
-; CHECK-NEXT: csel x10, x5, x10, ne
-; CHECK-NEXT: stp x8, x10, [x9, #16]
+; CHECK-NEXT: csel x10, x4, x10, ne
+; CHECK-NEXT: csel x8, x5, x8, ne
+; CHECK-NEXT: stp x10, x8, [x9, #16]
 ; CHECK-NEXT: stp x11, x12, [x9]
 ; CHECK-NEXT: ret
   %cond = and i32 %In1, 1
diff --git a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
--- a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -65,8 +65,8 @@
 define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-NEXT: adrp x8, [[SET3]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF
-; CHECK-NEXT: stp w2, w0, [x8]
-; CHECK-NEXT: str w1, [x8, #8]
+; CHECK-NEXT: stp w0, w1, [x8, #4]
+; CHECK-NEXT: str w2, [x8]
 ; CHECK-NEXT: ret
   store i32 %a1, i32* @m4, align 4
   store i32 %a2, i32* @n4, align 4
diff --git a/llvm/test/CodeGen/AArch64/machine-scheduler.mir b/llvm/test/CodeGen/AArch64/machine-scheduler.mir
--- a/llvm/test/CodeGen/AArch64/machine-scheduler.mir
+++ b/llvm/test/CodeGen/AArch64/machine-scheduler.mir
@@ -18,8 +18,8 @@
 ---
 # CHECK-LABEL: name: load_imp-def
 # CHECK: bb.0.entry:
-# CHECK: LDRWui $x0, 0
 # CHECK: LDRWui $x0, 1
+# CHECK: LDRWui $x0, 0
 # CHECK: STRWui $w1, $x0, 2
 name: load_imp-def
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
--- a/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-constant-to-vector.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}cast_constant_i64_to_build_vector_v4i16:
-; GCN: global_store_dwordx2
-; GCN: global_store_dword v
 ; GCN: global_store_short
+; GCN: global_store_dword v
+; GCN: global_store_dwordx2
 define amdgpu_kernel void @cast_constant_i64_to_build_vector_v4i16(i8 addrspace(1)* nocapture %data) {
 entry:
   store i8 72, i8 addrspace(1)* %data, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -133,10 +133,10 @@
 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 
 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
@@ -331,10 +331,10 @@
 ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 ; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20
 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24
 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16
 ; GCN: s_waitcnt vmcnt(0)
 
 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -765,16 +765,17 @@
 ; GCN-LABEL: {{^}}tail_call_byval_align16:
 ; GCN-NOT: s32
-; GCN-NOT: buffer_store_dword v33
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NOT: buffer_store_dword v33
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16
 ; GCN: s_getpc_b64
 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -199,15 +199,14 @@
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_mov_b32_e32 v2, s4
 ; GCN-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NEXT: global_load_dword v4, v[2:3], off
-; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
+; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:4
+; GCN-NEXT: global_load_dword v2, v[2:3], off
 ; GCN-NEXT: v_mov_b32_e32 v0, s6
 ; GCN-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4
-; GCN-NEXT: buffer_store_short_d16_hi v4, off, s[0:3], s9 offset:6
-; GCN-NEXT: s_waitcnt vmcnt(2)
-; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:4
+; GCN-NEXT: buffer_store_short_d16_hi v2, off, s[0:3], s9 offset:6
+; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:8
 ; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4
 ; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6
 ; GCN-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -236,19 +236,21 @@
 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v6, vcc, 1, v0
 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[6:7]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[6:7]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_e32 v2, v2, v4
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v1
 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2
 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@@ -414,35 +416,34 @@
 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-NEXT: flat_load_ubyte v9, v[2:3]
 ; VI-NEXT: flat_load_ubyte v10, v[4:5]
-; VI-NEXT: flat_load_ubyte v11, v[2:3]
 ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0
 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0
 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[8:9]
-; VI-NEXT: flat_load_ubyte v7, v[6:7]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 6, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v4, v[6:7]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
 ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9
 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
-; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11
-; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT: v_or_b32_e32 v0, v3, v0
-; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1
+; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10
+; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; VI-NEXT: v_or_b32_e32 v4, v3, v4
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v0
+; VI-NEXT: v_or_b32_e32 v0, v1, v8
 ; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_or_b32_e32 v4, v4, v7
 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
 ; VI-NEXT: v_or_b32_e32 v4, v4, v5
 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4
@@ -699,23 +700,24 @@
 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0
 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: flat_load_ubyte v1, v[6:7]
-; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v6, v[0:1]
+; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT: flat_load_ubyte v2, v[2:3]
+; VI-NEXT: flat_load_ubyte v3, v[4:5]
+; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; VI-NEXT: v_or_b32_e32 v1, v1, v6
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_e32 v4, v2, v0
-; VI-NEXT: v_or_b32_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v0, v0, v4
+; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
+; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v1
 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT: s_endpgm
 %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -473,8 +473,8 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 ; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -231,10 +231,10 @@
 ; GCN-LABEL: {{^}}undefined_stack_store_reg:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
 ; GCN: buffer_store_dword v0, off, s[0:3], s34 offset:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:
 define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 {
 bb:
   %tmp = alloca <4 x float>, align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
--- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
@@ -4,8 +4,8 @@
 ; an unused stack slot, causing ScratchSize to be non-zero.
 
 ; GCN-LABEL: store_v3i32:
-; GCN: ds_read_b64
 ; GCN: ds_read_b32
+; GCN: ds_read_b64
 ; GCN: ds_write_b32
 ; GCN: ds_write_b64
 ; GCN: ScratchSize: 0
@@ -17,8 +17,8 @@
 }
 
 ; GCN-LABEL: store_v5i32:
-; GCN: ds_read2_b64
 ; GCN: ds_read_b32
+; GCN: ds_read2_b64
 ; GCN: ds_write_b32
 ; GCN: ds_write2_b64
 ; GCN: ScratchSize: 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1642,10 +1642,10 @@
 ; SI-NEXT: v_mov_b32_e32 v13, s21
 ; SI-NEXT: v_mov_b32_e32 v14, s22
 ; SI-NEXT: v_mov_b32_e32 v15, s23
-; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
+; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
 ; SI-NEXT: v_or_b32_e32 v16, s4, v16
 ; SI-NEXT: v_mov_b32_e32 v0, 0
 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
@@ -1688,10 +1688,10 @@
 ; VI-NEXT: v_mov_b32_e32 v13, s21
 ; VI-NEXT: v_mov_b32_e32 v14, s22
 ; VI-NEXT: v_mov_b32_e32 v15, s23
-; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
+; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96
 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64
 ; VI-NEXT: v_or_b32_e32 v16, s4, v16
 ; VI-NEXT: v_mov_b32_e32 v0, 0
 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -855,8 +855,8 @@
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
--- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -75,8 +75,8 @@
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-VI: kernarg_segment_byte_size = 28
-; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -137,22 +137,22 @@
 ; VI-NEXT: v_mov_b32_e32 v7, s5
 ; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6
 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT: flat_load_dword v8, v[0:1]
-; VI-NEXT: flat_load_ushort v9, v[4:5]
+; VI-NEXT: flat_load_ushort v8, v[4:5]
+; VI-NEXT: flat_load_dword v9, v[0:1]
 ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; VI-NEXT: flat_load_dword v2, v[2:3]
 ; VI-NEXT: flat_load_ushort v0, v[0:1]
+; VI-NEXT: flat_load_dword v1, v[2:3]
 ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6
 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT: v_max_i16_e32 v1, v8, v2
-; VI-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_max_i16_e32 v0, v8, v0
 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: v_max_i16_e32 v0, v9, v0
-; VI-NEXT: flat_store_dword v[6:7], v1
+; VI-NEXT: v_max_i16_e32 v2, v9, v1
+; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
 ; VI-NEXT: flat_store_short v[4:5], v0
+; VI-NEXT: flat_store_dword v[6:7], v1
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: v_test_imax_sge_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -221,26 +221,26 @@
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ;
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -299,10 +299,10 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@@ -352,8 +352,8 @@
 ;
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -454,13 +454,13 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ;
 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -519,8 +519,8 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ;
 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -509,8 +509,8 @@
 ; GCN-NEXT: v_mov_b32_e32 v10, 16
 ; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
 ; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT: s_endpgm
   %shift = shl <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -579,8 +579,8 @@
 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
 ; GCN-NEXT: v_mov_b32_e32 v10, 16
 ; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT: s_endpgm
   %shift = lshr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
@@ -653,8 +653,8 @@
 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1]
 ; GCN-NEXT: v_mov_b32_e32 v10, 16
 ; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
+; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
 ; GCN-NEXT: s_endpgm
   %shift = ashr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, <2 x i128> addrspace(1)* null
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
--- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll
@@ -10,21 +10,20 @@
 ; GCN-NEXT: BB0_1: ; %bb0
 ; GCN-NEXT: ; =>This Loop Header: Depth=1
 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2
-; GCN-NEXT: v_add_co_u32_e64 v4, vcc_lo, v0, 8
+; GCN-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 8
 ; GCN-NEXT: s_mov_b32 s5, exec_lo
-; GCN-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
-; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
+; GCN-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GCN-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
+; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
 ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1
 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2
-; GCN-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; GCN-NEXT: v_readfirstlane_b32 s8, v2
-; GCN-NEXT: v_readfirstlane_b32 s9, v3
 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_readfirstlane_b32 s10, v4
-; GCN-NEXT: v_readfirstlane_b32 s11, v5
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3]
-; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5]
+; GCN-NEXT: v_readfirstlane_b32 s8, v4
+; GCN-NEXT: v_readfirstlane_b32 s9, v5
+; GCN-NEXT: v_readfirstlane_b32 s10, v2
+; GCN-NEXT: v_readfirstlane_b32 s11, v3
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5]
+; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
 ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4
 ; GCN-NEXT: s_and_saveexec_b32 s4, s4
 ; GCN-NEXT: s_nop 0