diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -871,10 +871,6 @@ /// (3) Bottom-up allocation is no longer guaranteed to optimally color. virtual bool reverseLocalAssignment() const { return false; } - /// Add the allocation priority to global and split ranges as well as the - /// local ranges when registers are added to the queue. - virtual bool addAllocPriorityToGlobalRanges() const { return false; } - /// Allow the target to override the cost of using a callee-saved register for /// the first time. Default value of 0 means we will use a callee-saved /// register if it is available. diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -760,7 +760,6 @@ // Giant live ranges fall back to the global assignment heuristic, which // prevents excessive spilling in pathological cases. bool ReverseLocal = TRI->reverseLocalAssignment(); - bool AddPriorityToGlobal = TRI->addAllocPriorityToGlobalRanges(); const TargetRegisterClass &RC = *MRI->getRegClass(Reg); bool ForceGlobal = !ReverseLocal && (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs()); @@ -785,8 +784,7 @@ // interference. Mark a bit to prioritize global above local ranges. Prio = (1u << 29) + Size; - if (AddPriorityToGlobal) - Prio |= RC.AllocationPriority << 24; + Prio |= RC.AllocationPriority << 24; } // Mark a higher bit to prioritize global and local above RS_Split. Prio |= (1u << 31); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -147,8 +147,6 @@ unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; - bool addAllocPriorityToGlobalRanges() const override { return true; } - // Support for virtual base registers. bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -7,18 +7,16 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 { ; GCN-LABEL: v_insert_v64i32_varidx: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x10 ; GCN-NEXT: s_add_u32 s0, s0, s7 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 ; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v16, 0x100 -; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0 -; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40 -; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80 -; GCN-NEXT: s_and_b32 s4, s7, 63 -; GCN-NEXT: s_lshl_b32 s4, s4, 2 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0x0 +; GCN-NEXT: s_load_dwordx16 s[52:67], s[22:23], 0x40 +; GCN-NEXT: s_load_dwordx16 s[4:19], s[22:23], 0x80 +; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NEXT: v_mov_b32_e32 v1, s37 @@ -36,7 +34,7 @@ ; GCN-NEXT: v_mov_b32_e32 v13, s49 ; GCN-NEXT: v_mov_b32_e32 v14, s50 ; GCN-NEXT: v_mov_b32_e32 v15, s51 -; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0 +; GCN-NEXT: s_load_dwordx16 s[36:51], s[22:23], 0xc0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:260 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:264 @@ -85,37 +83,37 @@ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:376 ; GCN-NEXT: v_mov_b32_e32 v0, s67 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:380 -; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:384 -; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:388 -; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:392 -; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:396 -; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:400 -; GCN-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:404 -; GCN-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:408 -; GCN-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NEXT: v_mov_b32_e32 v0, s11 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:412 -; GCN-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:416 -; GCN-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NEXT: v_mov_b32_e32 v0, s13 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:420 -; GCN-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:424 -; GCN-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:428 -; GCN-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NEXT: v_mov_b32_e32 v0, s16 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:432 -; GCN-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:436 -; GCN-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:440 -; GCN-NEXT: v_mov_b32_e32 v0, s27 +; GCN-NEXT: v_mov_b32_e32 v0, s19 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:444 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s36 @@ -145,13 +143,15 @@ ; GCN-NEXT: v_mov_b32_e32 v0, s48 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:496 ; GCN-NEXT: v_mov_b32_e32 v0, s49 +; GCN-NEXT: s_and_b32 s4, s25, 63 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:500 ; GCN-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:504 ; GCN-NEXT: v_mov_b32_e32 v0, s51 +; GCN-NEXT: s_lshl_b32 s4, s4, 2 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:508 ; GCN-NEXT: v_add_u32_e32 v0, s4, v16 -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s24 ; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: s_nop 0 @@ -219,37 +219,37 @@ ; GCN-NEXT: buffer_load_dword v62, off, s[0:3], 0 offset:504 ; GCN-NEXT: buffer_load_dword v63, off, s[0:3], 0 offset:508 ; GCN-NEXT: s_waitcnt vmcnt(60) -; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[8:9] +; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[20:21] ; GCN-NEXT: s_waitcnt vmcnt(57) -; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[8:9] offset:16 +; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[20:21] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(54) -; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[8:9] offset:32 +; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[20:21] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(51) -; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[8:9] offset:48 +; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[20:21] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(48) -; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[8:9] offset:64 +; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[20:21] offset:64 ; GCN-NEXT: s_waitcnt vmcnt(45) -; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[8:9] offset:80 +; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[20:21] offset:80 ; GCN-NEXT: s_waitcnt vmcnt(42) -; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[8:9] offset:96 +; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[20:21] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(39) -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[8:9] offset:112 +; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[20:21] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(36) -; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[8:9] offset:128 +; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[20:21] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(33) -; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[8:9] offset:144 +; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[20:21] offset:144 ; GCN-NEXT: s_waitcnt vmcnt(30) -; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[8:9] offset:160 +; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[20:21] offset:160 ; GCN-NEXT: s_waitcnt vmcnt(27) -; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[8:9] offset:176 +; GCN-NEXT: global_store_dwordx4 v64, v[44:47], s[20:21] offset:176 ; GCN-NEXT: s_waitcnt vmcnt(24) -; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[8:9] offset:192 +; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[20:21] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(21) -; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[8:9] offset:208 +; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[20:21] offset:208 ; GCN-NEXT: s_waitcnt vmcnt(18) -; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[8:9] offset:224 +; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[20:21] offset:224 ; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[8:9] offset:240 +; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[20:21] offset:240 ; GCN-NEXT: s_endpgm %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -946,9 +946,9 @@ ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: buffer_load_dwordx3 v[1:3], v[1:2], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7-NEXT: s_cbranch_execz BB13_2 ; GFX7-NEXT: ; %bb.1: ; %bb ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x1d @@ -956,10 +956,10 @@ ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cmp_lg_u32 s0, 0 -; GFX7-NEXT: s_cselect_b32 s2, 1, 0 +; GFX7-NEXT: s_cselect_b32 s6, 1, 0 ; GFX7-NEXT: BB13_2: ; %exit -; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7-NEXT: s_and_b32 s0, 1, s2 +; GFX7-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] @@ -971,18 +971,18 @@ ; ; GFX8-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x4c +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dwordx3 v[1:3], v[1:2] ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz BB13_2 ; GFX8-NEXT: ; %bb.1: ; %bb ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74 @@ -990,12 +990,12 @@ ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_cselect_b32 s2, 1, 0 +; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: BB13_2: ; %exit -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX8-NEXT: s_add_u32 s0, s4, 8 -; GFX8-NEXT: s_addc_u32 s1, s5, 0 -; GFX8-NEXT: s_and_b32 s2, 1, s2 +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_add_u32 s0, s2, 8 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: s_and_b32 s2, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_nop 2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -147,6 +147,10 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: +; GFX1030-NEXT: v_mov_b32_e32 v5, v0 +; GFX1030-NEXT: v_mov_b32_e32 v9, v1 +; GFX1030-NEXT: v_mov_b32_e32 v13, v2 +; GFX1030-NEXT: v_mov_b32_e32 v18, v3 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v14 @@ -154,7 +158,7 @@ ; GFX1030-NEXT: v_readfirstlane_b32 s6, v16 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v17 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] -; GFX1030-NEXT: image_bvh_intersect_ray v[18:21], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v5, v9, v13, v18, v4, v6, v7, v8, v10, v11, v12], s[4:7] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 @@ -163,10 +167,6 @@ ; GFX1030-NEXT: ; %bb.2: ; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, v18 -; GFX1030-NEXT: v_mov_b32_e32 v1, v19 -; GFX1030-NEXT: v_mov_b32_e32 v2, v20 -; GFX1030-NEXT: v_mov_b32_e32 v3, v21 ; GFX1030-NEXT: ; return to shader part epilog ; ; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: @@ -207,23 +207,27 @@ ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff -; GFX1030-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v14, s0, v8 -; GFX1030-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9 +; GFX1030-NEXT: v_mov_b32_e32 v5, v0 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX1030-NEXT: v_mov_b32_e32 v14, v1 +; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 +; GFX1030-NEXT: v_mov_b32_e32 v15, v2 +; GFX1030-NEXT: v_mov_b32_e32 v16, v3 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1030-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX1030-NEXT: v_lshl_or_b32 v15, v15, 16, v8 -; GFX1030-NEXT: v_and_or_b32 v9, v6, s0, v5 -; GFX1030-NEXT: v_and_or_b32 v14, v7, s0, v14 +; GFX1030-NEXT: v_and_or_b32 v6, v6, s0, v0 +; GFX1030-NEXT: v_and_or_b32 v7, v7, s0, v1 +; GFX1030-NEXT: v_lshl_or_b32 v8, v3, 16, v2 ; GFX1030-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] -; GFX1030-NEXT: image_bvh_intersect_ray v[5:8], [v0, v1, v2, v3, v4, v9, v14, v15], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v5, v14, v15, v16, v4, v6, v7, v8], s[4:7] a16 ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 @@ -232,10 +236,6 @@ ; GFX1030-NEXT: ; %bb.2: ; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, v5 -; GFX1030-NEXT: v_mov_b32_e32 v1, v6 -; GFX1030-NEXT: v_mov_b32_e32 v2, v7 -; GFX1030-NEXT: v_mov_b32_e32 v3, v8 ; GFX1030-NEXT: ; return to shader part epilog ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: @@ -279,6 +279,10 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX1030: ; %bb.0: +; GFX1030-NEXT: v_mov_b32_e32 v6, v0 +; GFX1030-NEXT: v_mov_b32_e32 v10, v1 +; GFX1030-NEXT: v_mov_b32_e32 v14, v2 +; GFX1030-NEXT: v_mov_b32_e32 v19, v3 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v15 @@ -286,7 +290,7 @@ ; GFX1030-NEXT: v_readfirstlane_b32 s6, v17 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v18 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] -; GFX1030-NEXT: image_bvh64_intersect_ray v[19:22], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v6, v10, v14, v19, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 @@ -295,10 +299,6 @@ ; GFX1030-NEXT: ; %bb.2: ; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, v19 -; GFX1030-NEXT: v_mov_b32_e32 v1, v20 -; GFX1030-NEXT: v_mov_b32_e32 v2, v21 -; GFX1030-NEXT: v_mov_b32_e32 v3, v22 ; GFX1030-NEXT: ; return to shader part epilog ; ; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: @@ -339,23 +339,27 @@ ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_mov_b32 s0, 0xffff -; GFX1030-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX1030-NEXT: v_and_b32_e32 v15, s0, v9 -; GFX1030-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX1030-NEXT: v_and_b32_e32 v16, s0, v10 +; GFX1030-NEXT: v_mov_b32_e32 v6, v0 +; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GFX1030-NEXT: v_mov_b32_e32 v15, v1 +; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9 +; GFX1030-NEXT: v_mov_b32_e32 v16, v2 +; GFX1030-NEXT: v_mov_b32_e32 v17, v3 +; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX1030-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX1030-NEXT: v_lshl_or_b32 v16, v16, 16, v9 -; GFX1030-NEXT: v_and_or_b32 v10, v7, s0, v6 -; GFX1030-NEXT: v_and_or_b32 v15, v8, s0, v15 +; GFX1030-NEXT: v_and_or_b32 v7, v7, s0, v0 +; GFX1030-NEXT: v_and_or_b32 v8, v8, s0, v1 +; GFX1030-NEXT: v_lshl_or_b32 v9, v3, 16, v2 ; GFX1030-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 ; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 ; GFX1030-NEXT: v_readfirstlane_b32 s7, v14 ; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] -; GFX1030-NEXT: image_bvh64_intersect_ray v[6:9], [v0, v1, v2, v3, v4, v5, v10, v15, v16], s[4:7] a16 +; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v6, v15, v16, v17, v4, v5, v7, v8, v9], s[4:7] a16 ; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] ; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo ; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 @@ -364,10 +368,6 @@ ; GFX1030-NEXT: ; %bb.2: ; GFX1030-NEXT: s_mov_b32 exec_lo, s1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, v6 -; GFX1030-NEXT: v_mov_b32_e32 v1, v7 -; GFX1030-NEXT: v_mov_b32_e32 v2, v8 -; GFX1030-NEXT: v_mov_b32_e32 v3, v9 ; GFX1030-NEXT: ; return to shader part epilog ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -663,15 +663,15 @@ ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_movk_i32 s2, 0x3c00 -; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 -; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 -; SI-NEXT: s_lshl_b32 s2, s4, 16 -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_lshl_b32 s3, s3, 16 -; SI-NEXT: s_or_b32 s3, s4, s3 +; SI-NEXT: s_bfe_u32 s3, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s2, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s4, s3, 16 +; SI-NEXT: s_or_b32 s4, s2, s4 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s5, s3, s2 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz BB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -680,9 +680,9 @@ ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] ; SI-NEXT: BB6_3: ; %.continue0 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, v0 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -691,10 +691,10 @@ ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_and_b64 s[2:3], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; SI-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[6:7] ; SI-NEXT: s_cbranch_execz BB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -702,9 +702,9 @@ ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 ; SI-NEXT: BB6_6: ; %.continue1 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: BB6_7: @@ -893,16 +893,16 @@ ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: s_movk_i32 s2, 0x3c00 -; SI-NEXT: s_bfe_u32 s4, 0, 0x100000 -; SI-NEXT: s_bfe_u32 s3, s2, 0x100000 -; SI-NEXT: s_lshl_b32 s2, s4, 16 -; SI-NEXT: s_or_b32 s2, s3, s2 -; SI-NEXT: s_lshl_b32 s3, s3, 16 -; SI-NEXT: s_or_b32 s3, s4, s3 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_bfe_u32 s3, 0, 0x100000 +; SI-NEXT: s_bfe_u32 s2, s2, 0x100000 +; SI-NEXT: s_lshl_b32 s4, s3, 16 +; SI-NEXT: s_or_b32 s6, s2, s4 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_or_b32 s7, s3, s2 +; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz BB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -911,22 +911,22 @@ ; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[8:9] ; SI-NEXT: BB7_3: ; %.continue0.preheader -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_branch BB7_5 ; SI-NEXT: BB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1 -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execz BB7_8 ; SI-NEXT: BB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 s[6:7], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[6:7] +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v3, v2 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 @@ -934,10 +934,10 @@ ; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; SI-NEXT: s_and_b64 s[6:7], s[0:1], vcc -; SI-NEXT: s_xor_b64 s[6:7], s[6:7], -1 -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; SI-NEXT: s_and_b64 s[4:5], s[0:1], vcc +; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] ; SI-NEXT: s_cbranch_execz BB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -949,10 +949,10 @@ ; SI-NEXT: s_and_b64 exec, exec, s[8:9] ; SI-NEXT: s_branch BB7_4 ; SI-NEXT: BB7_8: ; %.return -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm ; SI-NEXT: s_endpgm ; SI-NEXT: BB7_9: @@ -1094,10 +1094,10 @@ ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: s_mov_b32 s4, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz BB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -1106,8 +1106,8 @@ ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] ; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch BB7_5 ; GFX10-64-NEXT: BB7_4: ; %.continue1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -8,45 +8,47 @@ ; CHECK-LABEL: v_sdiv_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v2, v4 -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v4 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v2, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v7 +; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v6, v6 -; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5 +; CHECK-NEXT: v_xor_b32_e32 v5, v5, v7 +; CHECK-NEXT: v_mul_lo_u32 v10, v9, v3 ; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v5 +; CHECK-NEXT: v_mul_hi_u32 v13, v8, v3 +; CHECK-NEXT: v_mul_lo_u32 v12, v8, v3 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; CHECK-NEXT: v_mul_lo_u32 v11, v6, v12 -; CHECK-NEXT: v_mul_lo_u32 v13, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v14, v5, v12 +; CHECK-NEXT: v_mul_lo_u32 v13, v3, v10 +; CHECK-NEXT: v_mul_hi_u32 v14, v3, v12 ; CHECK-NEXT: v_mul_hi_u32 v12, v6, v12 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -54,7 +56,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v14, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CHECK-NEXT: v_mul_hi_u32 v13, v5, v10 +; CHECK-NEXT: v_mul_hi_u32 v13, v3, v10 ; CHECK-NEXT: v_mul_hi_u32 v10, v6, v10 ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -65,18 +67,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v11 ; CHECK-NEXT: v_addc_u32_e64 v11, s[4:5], v6, v10, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v9, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v9, v3 ; CHECK-NEXT: v_mul_lo_u32 v12, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v8, v5 +; CHECK-NEXT: v_mul_lo_u32 v13, v8, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v8, v3 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 -; CHECK-NEXT: v_mul_hi_u32 v10, v5, v13 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v13 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_mul_lo_u32 v9, v11, v13 -; CHECK-NEXT: v_mul_lo_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v12, v3, v8 ; CHECK-NEXT: v_mul_hi_u32 v13, v11, v13 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -84,7 +86,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v10, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v5, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v3, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v11, v8 ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -96,99 +98,97 @@ ; CHECK-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 ; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v5, v6 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v9, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v5, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v2, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v3 +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v1, v3 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v3 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v5, v8, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v8 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v2 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v9 -; CHECK-NEXT: v_addc_u32_e32 v2, vcc, 0, v10, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v2, v7, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2 -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2 -; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB0_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v0, v4 -; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, %den ret i64 %result @@ -692,11 +692,13 @@ ; CGP-LABEL: v_sdiv_v2i64: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v5 +; CGP-NEXT: v_mov_b32_e32 v11, v1 +; CGP-NEXT: v_mov_b32_e32 v10, v0 +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v3 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -704,44 +706,44 @@ ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v5, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v10 -; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v11 +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v5 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v5 -; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_xor_b32_e32 v9, v9, v11 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v10 -; CGP-NEXT: v_mul_hi_u32 v17, v12, v5 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v5 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v11, v11 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v11, v11 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v17, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v3 +; CGP-NEXT: v_xor_b32_e32 v10, v10, v5 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v16 -; CGP-NEXT: v_mul_lo_u32 v17, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v18, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v16 +; CGP-NEXT: v_mul_lo_u32 v17, v3, v14 +; CGP-NEXT: v_mul_hi_u32 v18, v3, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v11, v16 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v18, v10, v14 +; CGP-NEXT: v_mul_lo_u32 v18, v11, v14 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v5, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_mul_hi_u32 v17, v3, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v11, v14 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 @@ -751,18 +753,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v15 -; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v13, v5 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 +; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v11, v14, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v12, v5 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 +; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v17 +; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v3, v12 ; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] @@ -770,7 +772,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v3, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] @@ -781,71 +783,71 @@ ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v11, v12, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v10 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v10, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v5 -; CGP-NEXT: v_mul_lo_u32 v13, v1, v10 -; CGP-NEXT: v_mul_hi_u32 v15, v1, v5 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v1, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v3 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v9, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v12 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v4 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v10, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v2 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v2, vcc ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v2 +; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v4 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v2 ; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v14, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v4, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v5 -; CGP-NEXT: v_xor_b32_e32 v1, v4, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v5, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 +; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -859,68 +861,68 @@ ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v5, v3, v7 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v6 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v9 -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v7 -; CGP-NEXT: v_mul_lo_u32 v13, v10, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v7 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v7 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v7, vcc +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 +; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v6, v6, v7 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v5 +; CGP-NEXT: v_mul_lo_u32 v13, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v5 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v7 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v15, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v16, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v14 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v8, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v9, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v9, v12 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 @@ -930,18 +932,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v11, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v9, v12, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v11, v5 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v7 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v5 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v15 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v14, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -949,7 +951,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v5, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -960,100 +962,98 @@ ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v6, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v5 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v5 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v10, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v10 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v8, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v6, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v9, v4 -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v5 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v5 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v7, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 +; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v5, 0 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, %den ret <2 x i64> %result @@ -2510,37 +2510,39 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v2 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v2 -; CHECK-NEXT: v_xor_b32_e32 v4, v4, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v4 -; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v2, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CHECK-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v3 -; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v4, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v7 +; CHECK-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 +; CHECK-NEXT: v_subb_u32_e32 v9, vcc, 0, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v7 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v6, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v7 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v7 ; CHECK-NEXT: v_mul_lo_u32 v10, v9, v5 ; CHECK-NEXT: v_mul_lo_u32 v11, v8, v6 ; CHECK-NEXT: v_mul_hi_u32 v13, v8, v5 @@ -2601,18 +2603,18 @@ ; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CHECK-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v1, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v10, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v9, v0, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -2622,76 +2624,74 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v4, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v3, v6 -; CHECK-NEXT: v_mul_hi_u32 v11, v3, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v1, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v5 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 -; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 +; CHECK-NEXT: v_subb_u32_e64 v9, s[4:5], v4, v8, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v8 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v2 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, 1, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] ; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v9 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v9 ; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v3 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v3 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v7, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 +; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB7_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v0, v2 -; CHECK-NEXT: v_mov_b32_e32 v1, v3 ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -2995,58 +2995,60 @@ ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 -; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: v_mov_b32_e32 v5, v0 -; CGP-NEXT: v_or_b32_e32 v1, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 +; CGP-NEXT: v_mov_b32_e32 v9, v1 +; CGP-NEXT: v_mov_b32_e32 v8, v0 +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 +; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v11 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v0, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v4, v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v10 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_xor_b32_e32 v2, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 +; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v6 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v6, vcc ; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v11 -; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 -; CGP-NEXT: v_xor_b32_e32 v7, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v10 -; CGP-NEXT: v_mul_hi_u32 v17, v12, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v12, v6 +; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 +; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v3 +; CGP-NEXT: v_trunc_f32_e32 v9, v9 +; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v9 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v4, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v15, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v17, v12, v3 +; CGP-NEXT: v_mul_lo_u32 v16, v12, v3 +; CGP-NEXT: v_xor_b32_e32 v8, v8, v6 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v16 -; CGP-NEXT: v_mul_lo_u32 v17, v6, v14 -; CGP-NEXT: v_mul_hi_u32 v18, v6, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v10, v16 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v16 +; CGP-NEXT: v_mul_lo_u32 v17, v3, v14 +; CGP-NEXT: v_mul_hi_u32 v18, v3, v16 +; CGP-NEXT: v_mul_hi_u32 v16, v9, v16 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v18 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v18, v10, v14 +; CGP-NEXT: v_mul_lo_u32 v18, v9, v14 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 -; CGP-NEXT: v_mul_hi_u32 v17, v6, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v14 +; CGP-NEXT: v_mul_hi_u32 v17, v3, v14 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v14 ; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v16 ; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v17 @@ -3056,18 +3058,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v10, v14, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v13, v6 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15 +; CGP-NEXT: v_addc_u32_e64 v15, s[4:5], v9, v14, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v13, v3 ; CGP-NEXT: v_mul_lo_u32 v16, v12, v15 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v12, v6 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 +; CGP-NEXT: v_mul_lo_u32 v17, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v12, v3 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 -; CGP-NEXT: v_mul_hi_u32 v14, v6, v17 +; CGP-NEXT: v_mul_hi_u32 v14, v3, v17 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_mul_lo_u32 v13, v15, v17 -; CGP-NEXT: v_mul_lo_u32 v16, v6, v12 +; CGP-NEXT: v_mul_lo_u32 v16, v3, v12 ; CGP-NEXT: v_mul_hi_u32 v17, v15, v17 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] @@ -3075,7 +3077,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v14, v15, v12 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v3, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v15, v12 ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] @@ -3086,137 +3088,137 @@ ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v14 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v6 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v9 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v1, v10 -; CGP-NEXT: v_mul_hi_u32 v15, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v14, v1, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v1, v9 +; CGP-NEXT: v_mul_hi_u32 v15, v1, v3 +; CGP-NEXT: v_mul_lo_u32 v14, v1, v3 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v7, v12, vcc -; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v12 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v4 -; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; CGP-NEXT: v_subb_u32_e64 v13, s[4:5], v8, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v2 +; CGP-NEXT: v_subb_u32_e32 v8, vcc, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v1 -; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v4, v1 +; CGP-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v6 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v2 +; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[4:5] -; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v14, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v13 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v4, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v1, v5 -; CGP-NEXT: v_xor_b32_e32 v1, v4, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v6, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 +; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v5, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v5, v3, v9 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v8, v6 -; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v6, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v9 -; CGP-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v4, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v8, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 +; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 +; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v5, v9 +; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v8, v8 -; CGP-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 +; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v9 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v7 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v6 ; CGP-NEXT: v_mul_lo_u32 v13, v10, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v7 -; CGP-NEXT: v_mul_lo_u32 v14, v10, v7 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v6 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_mul_lo_u32 v13, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v15, v7, v12 -; CGP-NEXT: v_mul_hi_u32 v16, v7, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v6, v12 +; CGP-NEXT: v_mul_hi_u32 v16, v6, v14 ; CGP-NEXT: v_mul_hi_u32 v14, v8, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc @@ -3224,7 +3226,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v16, v8, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v12 +; CGP-NEXT: v_mul_hi_u32 v15, v6, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v8, v12 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc @@ -3235,18 +3237,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v8, v12, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v11, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v11, v6 ; CGP-NEXT: v_mul_lo_u32 v14, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v10, v7 +; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v10, v10, v6 ; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 -; CGP-NEXT: v_mul_hi_u32 v12, v7, v15 +; CGP-NEXT: v_mul_hi_u32 v12, v6, v15 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v13, v15 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v10 +; CGP-NEXT: v_mul_lo_u32 v14, v6, v10 ; CGP-NEXT: v_mul_hi_u32 v15, v13, v15 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -3254,7 +3256,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v12, v13, v10 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v14, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v6, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -3266,99 +3268,97 @@ ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v3, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v6 +; CGP-NEXT: v_mul_hi_u32 v6, v7, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v7, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v7 -; CGP-NEXT: v_mul_lo_u32 v11, v5, v8 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v6 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v3, v10, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v10 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v6 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v7, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v6 -; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v7 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v6 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5] ; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v4 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v2, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v9, v4 -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v5 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v5 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v5 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v9, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 +; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v8 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v8 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v5, 0 +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1341,31 +1341,31 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64> addrspace(1)* %out1, <2 x i64> %x, <2 x i64> %y) { ; GFX8-LABEL: sdivrem_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s2, s13, 31 ; GFX8-NEXT: s_ashr_i32 s6, s9, 31 -; GFX8-NEXT: s_add_u32 s0, s12, s2 -; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s13, s2 +; GFX8-NEXT: s_ashr_i32 s12, s1, 31 ; GFX8-NEXT: s_add_u32 s8, s8, s6 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 -; GFX8-NEXT: s_mov_b32 s7, s6 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_and_b32 s7, s7, 1 +; GFX8-NEXT: s_cmp_lg_u32 s7, 0 ; GFX8-NEXT: s_addc_u32 s9, s9, s6 +; GFX8-NEXT: s_add_u32 s0, s0, s12 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_and_b32 s7, s7, 1 +; GFX8-NEXT: s_cmp_lg_u32 s7, 0 +; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_addc_u32 s1, s1, s12 +; GFX8-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX8-NEXT: s_mov_b32 s7, s6 ; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX8-NEXT: s_mov_b32 s3, s2 -; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_sub_u32 s16, 0, s8 +; GFX8-NEXT: s_sub_u32 s16, 0, s14 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1376,12 +1376,12 @@ ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s17, 0, s9 +; GFX8-NEXT: s_subb_u32 s17, 0, s15 ; GFX8-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX8-NEXT: v_mul_lo_u32 v2, s16, v1 ; GFX8-NEXT: v_mul_hi_u32 v5, s16, v0 ; GFX8-NEXT: v_mul_lo_u32 v4, s16, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, s15 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4 @@ -1438,19 +1438,19 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX8-NEXT: v_mul_hi_u32 v5, s12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 +; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX8-NEXT: v_mul_hi_u32 v5, s8, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v5, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 @@ -1460,33 +1460,33 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: v_mul_hi_u32 v7, s8, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, s8, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, s15, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, s14, v1 +; GFX8-NEXT: v_mul_hi_u32 v7, s14, v0 +; GFX8-NEXT: v_mul_lo_u32 v5, s14, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s12, v5 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s13, v2 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v4 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v2, vcc ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v0 ; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8 ; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s14, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc @@ -1501,55 +1501,55 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] +; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: s_ashr_i32 s6, s15, 31 ; GFX8-NEXT: s_ashr_i32 s8, s11, 31 +; GFX8-NEXT: s_ashr_i32 s12, s3, 31 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: s_add_u32 s0, s14, s6 +; GFX8-NEXT: s_add_u32 s0, s10, s8 ; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: s_addc_u32 s1, s15, s6 -; GFX8-NEXT: s_add_u32 s10, s10, s8 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_cmp_lg_u32 s3, 0 -; GFX8-NEXT: s_mov_b32 s9, s8 -; GFX8-NEXT: s_addc_u32 s11, s11, s8 -; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] +; GFX8-NEXT: s_addc_u32 s1, s11, s8 +; GFX8-NEXT: s_add_u32 s2, s2, s12 +; GFX8-NEXT: s_cselect_b32 s7, 1, 0 +; GFX8-NEXT: s_and_b32 s7, s7, 1 +; GFX8-NEXT: s_cmp_lg_u32 s7, 0 +; GFX8-NEXT: s_mov_b32 s13, s12 +; GFX8-NEXT: s_addc_u32 s3, s3, s12 +; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s11 -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX8-NEXT: v_xor_b32_e32 v3, s6, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; GFX8-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GFX8-NEXT: v_mov_b32_e32 v6, s2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s2, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v2, v6, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v7 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX8-NEXT: s_mov_b32 s7, s6 +; GFX8-NEXT: s_mov_b32 s9, s8 ; GFX8-NEXT: v_trunc_f32_e32 v3, v3 ; GFX8-NEXT: v_mul_f32_e32 v6, 0xcf800000, v3 -; GFX8-NEXT: s_xor_b64 s[2:3], s[0:1], s[6:7] +; GFX8-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] ; GFX8-NEXT: v_add_f32_e32 v2, v6, v2 -; GFX8-NEXT: s_sub_u32 s12, 0, s10 +; GFX8-NEXT: s_sub_u32 s10, 0, s2 ; GFX8-NEXT: s_cselect_b32 s0, 1, 0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: s_and_b32 s0, s0, 1 ; GFX8-NEXT: s_cmp_lg_u32 s0, 0 -; GFX8-NEXT: s_subb_u32 s13, 0, s11 -; GFX8-NEXT: v_mul_lo_u32 v6, s13, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s12, v3 -; GFX8-NEXT: v_mul_hi_u32 v9, s12, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s12, v2 -; GFX8-NEXT: v_mov_b32_e32 v10, s11 +; GFX8-NEXT: s_subb_u32 s11, 0, s3 +; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s10, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s10, v2 +; GFX8-NEXT: v_mov_b32_e32 v10, s3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_mul_lo_u32 v7, v3, v8 @@ -1575,10 +1575,10 @@ ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; GFX8-NEXT: v_addc_u32_e64 v7, s[0:1], v3, v6, vcc -; GFX8-NEXT: v_mul_lo_u32 v8, s13, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, s12, v7 -; GFX8-NEXT: v_mul_hi_u32 v12, s12, v2 -; GFX8-NEXT: v_mul_lo_u32 v11, s12, v2 +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_mul_hi_u32 v12, s10, v2 +; GFX8-NEXT: v_mul_lo_u32 v11, s10, v2 ; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], v3, v6 ; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v9 ; GFX8-NEXT: v_add_u32_e64 v8, s[0:1], v8, v12 @@ -1586,7 +1586,6 @@ ; GFX8-NEXT: v_mul_lo_u32 v12, v2, v8 ; GFX8-NEXT: v_mul_hi_u32 v6, v2, v11 ; GFX8-NEXT: v_mul_hi_u32 v11, v7, v11 -; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 ; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], v9, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v6, s[0:1], v9, v6 @@ -1607,19 +1606,19 @@ ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s2, v3 -; GFX8-NEXT: v_mul_hi_u32 v9, s2, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, s3, v2 -; GFX8-NEXT: v_mov_b32_e32 v8, s3 +; GFX8-NEXT: v_mul_lo_u32 v6, s7, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s6, v3 +; GFX8-NEXT: v_mul_hi_u32 v9, s6, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, s3, v3 +; GFX8-NEXT: v_mul_lo_u32 v9, s7, v3 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_mul_hi_u32 v7, s2, v3 -; GFX8-NEXT: v_mul_hi_u32 v3, s3, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s7, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 @@ -1629,29 +1628,29 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6 -; GFX8-NEXT: v_mul_lo_u32 v6, s11, v2 -; GFX8-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, s10, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, s10, v2 +; GFX8-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX8-NEXT: v_mul_lo_u32 v7, s2, v3 +; GFX8-NEXT: v_mul_hi_u32 v11, s2, v2 +; GFX8-NEXT: v_mul_lo_u32 v9, s2, v2 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s2, v9 +; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s6, v9 ; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s3, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 +; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s7, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v8 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s10, v7 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v7 ; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v12 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v2 ; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v10, vcc @@ -1659,7 +1658,7 @@ ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s10, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 ; GFX8-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 @@ -1667,57 +1666,58 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GFX8-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v3 ; GFX8-NEXT: v_mov_b32_e32 v8, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 -; GFX8-NEXT: v_mov_b32_e32 v9, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, s8, v7 +; GFX8-NEXT: v_xor_b32_e32 v8, s8, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s8 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s8, v7 ; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s13, 31 ; GFX9-NEXT: s_ashr_i32 s6, s9, 31 -; GFX9-NEXT: s_add_u32 s0, s12, s2 -; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_addc_u32 s1, s13, s2 +; GFX9-NEXT: s_ashr_i32 s12, s1, 31 ; GFX9-NEXT: s_add_u32 s8, s8, s6 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_and_b32 s7, s7, 1 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: s_addc_u32 s9, s9, s6 +; GFX9-NEXT: s_add_u32 s0, s0, s12 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_and_b32 s7, s7, 1 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_addc_u32 s1, s1, s12 +; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GFX9-NEXT: s_mov_b32 s7, s6 ; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_u32 s16, 0, s8 +; GFX9-NEXT: s_sub_u32 s16, 0, s14 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1728,7 +1728,7 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_subb_u32 s17, 0, s9 +; GFX9-NEXT: s_subb_u32 s17, 0, s15 ; GFX9-NEXT: v_mul_lo_u32 v3, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1785,19 +1785,19 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1806,33 +1806,33 @@ ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, s14, v0 +; GFX9-NEXT: v_mul_lo_u32 v6, s14, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 ; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4 -; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s12, v6 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s8, v6 ; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 -; GFX9-NEXT: v_sub_u32_e32 v2, s13, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v4 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v3 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[0:1], 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s8, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s14, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc @@ -1848,28 +1848,28 @@ ; GFX9-NEXT: s_ashr_i32 s8, s11, 31 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[6:7] -; GFX9-NEXT: s_ashr_i32 s6, s15, 31 -; GFX9-NEXT: s_add_u32 s12, s14, s6 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_addc_u32 s13, s15, s6 +; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[12:13] +; GFX9-NEXT: s_ashr_i32 s12, s3, 31 ; GFX9-NEXT: s_add_u32 s10, s10, s8 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_cmp_lg_u32 s3, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_and_b32 s7, s7, 1 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 ; GFX9-NEXT: s_addc_u32 s11, s11, s8 +; GFX9-NEXT: s_add_u32 s2, s2, s12 +; GFX9-NEXT: s_cselect_b32 s7, 1, 0 +; GFX9-NEXT: s_and_b32 s7, s7, 1 +; GFX9-NEXT: s_cmp_lg_u32 s7, 0 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_addc_u32 s3, s3, s12 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s11 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 -; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: s_xor_b64 s[12:13], s[12:13], s[6:7] ; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4 ; GFX9-NEXT: v_add_f32_e32 v4, v4, v5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GFX9-NEXT: s_sub_u32 s3, 0, s10 +; GFX9-NEXT: s_sub_u32 s7, 0, s2 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 @@ -1882,11 +1882,11 @@ ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: s_subb_u32 s14, 0, s11 +; GFX9-NEXT: s_subb_u32 s14, 0, s3 ; GFX9-NEXT: v_mul_lo_u32 v8, s14, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s3, v6 -; GFX9-NEXT: v_mul_hi_u32 v10, s3, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, s3, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v6 +; GFX9-NEXT: v_mul_hi_u32 v10, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, s7, v4 ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc @@ -1895,7 +1895,7 @@ ; GFX9-NEXT: v_mul_lo_u32 v9, v4, v5 ; GFX9-NEXT: v_mul_hi_u32 v10, v4, v7 ; GFX9-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 @@ -1915,17 +1915,17 @@ ; GFX9-NEXT: v_add3_u32 v5, v9, v8, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, s14, v4 -; GFX9-NEXT: v_mul_lo_u32 v9, s3, v7 -; GFX9-NEXT: v_mul_hi_u32 v10, s3, v4 -; GFX9-NEXT: v_mul_lo_u32 v11, s3, v4 +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v7 +; GFX9-NEXT: v_mul_hi_u32 v10, s7, v4 +; GFX9-NEXT: v_mul_lo_u32 v11, s7, v4 ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX9-NEXT: v_add3_u32 v8, v8, v9, v10 ; GFX9-NEXT: v_mul_lo_u32 v9, v7, v11 ; GFX9-NEXT: v_mul_lo_u32 v10, v4, v8 ; GFX9-NEXT: v_mul_hi_u32 v6, v4, v11 ; GFX9-NEXT: v_mul_hi_u32 v11, v7, v11 -; GFX9-NEXT: v_mov_b32_e32 v12, s2 +; GFX9-NEXT: v_mov_b32_e32 v12, s6 ; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v9, v6 @@ -1945,20 +1945,20 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s2, v3 -; GFX9-NEXT: v_mul_lo_u32 v8, s13, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, s12, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v3 +; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6 +; GFX9-NEXT: v_mul_lo_u32 v9, s10, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v2, v12, vcc -; GFX9-NEXT: v_mul_hi_u32 v2, s12, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, s13, v6 +; GFX9-NEXT: v_mul_hi_u32 v2, s10, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, s11, v6 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v7 +; GFX9-NEXT: v_mul_lo_u32 v3, s11, v7 ; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, s12, v7 -; GFX9-NEXT: v_mul_hi_u32 v7, s13, v7 +; GFX9-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, s11, v7 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 @@ -1967,30 +1967,30 @@ ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add3_u32 v3, v6, v3, v7 -; GFX9-NEXT: v_mul_lo_u32 v6, s11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, s10, v2 -; GFX9-NEXT: v_mul_lo_u32 v10, s10, v2 -; GFX9-NEXT: v_mov_b32_e32 v11, s13 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 +; GFX9-NEXT: v_mul_hi_u32 v8, s2, v2 +; GFX9-NEXT: v_mul_lo_u32 v10, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s3 ; GFX9-NEXT: v_add3_u32 v6, v6, v7, v8 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s12, v10 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v10 ; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v11, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 -; GFX9-NEXT: v_sub_u32_e32 v6, s13, v6 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v8 +; GFX9-NEXT: v_sub_u32_e32 v6, s11, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v8 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s10, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v7 ; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v12 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v9, vcc @@ -2000,87 +2000,87 @@ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s10, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 ; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] -; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] +; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[12:13] ; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 ; GFX9-NEXT: v_mov_b32_e32 v8, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v8, vcc -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_xor_b32_e32 v7, s8, v7 +; GFX9-NEXT: v_xor_b32_e32 v8, s8, v6 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_mov_b32_e32 v9, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s8, v7 ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] -; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] +; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x10 -; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s2, s17, 31 -; GFX10-NEXT: s_ashr_i32 s0, s13, 31 -; GFX10-NEXT: s_add_u32 s8, s16, s2 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_addc_u32 s9, s17, s2 -; GFX10-NEXT: s_add_u32 s6, s12, s0 -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s1, 1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_mov_b32 s3, s2 -; GFX10-NEXT: s_addc_u32 s7, s13, s0 -; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] -; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX10-NEXT: s_sub_u32 s20, 0, s6 -; GFX10-NEXT: s_cselect_b32 s10, 1, 0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX10-NEXT: s_and_b32 s10, s10, 1 +; GFX10-NEXT: s_ashr_i32 s12, s9, 31 +; GFX10-NEXT: s_ashr_i32 s6, s1, 31 +; GFX10-NEXT: s_add_u32 s14, s8, s12 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-NEXT: s_mov_b32 s13, s12 +; GFX10-NEXT: s_and_b32 s7, s7, 1 +; GFX10-NEXT: s_cmp_lg_u32 s7, 0 +; GFX10-NEXT: s_addc_u32 s15, s9, s12 +; GFX10-NEXT: s_add_u32 s0, s0, s6 +; GFX10-NEXT: s_cselect_b32 s7, 1, 0 +; GFX10-NEXT: s_and_b32 s8, s7, 1 +; GFX10-NEXT: s_mov_b32 s7, s6 +; GFX10-NEXT: s_cmp_lg_u32 s8, 0 +; GFX10-NEXT: s_addc_u32 s1, s1, s6 +; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GFX10-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX10-NEXT: s_sub_u32 s22, 0, s8 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX10-NEXT: s_and_b32 s0, s0, 1 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 -; GFX10-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10-NEXT: s_subb_u32 s21, 0, s7 -; GFX10-NEXT: s_ashr_i32 s10, s19, 31 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_subb_u32 s23, 0, s9 +; GFX10-NEXT: s_ashr_i32 s16, s11, 31 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX10-NEXT: s_ashr_i32 s12, s15, 31 -; GFX10-NEXT: s_xor_b64 s[16:17], s[2:3], s[0:1] -; GFX10-NEXT: s_add_u32 s0, s18, s10 +; GFX10-NEXT: s_ashr_i32 s18, s3, 31 +; GFX10-NEXT: s_xor_b64 s[20:21], s[12:13], s[6:7] +; GFX10-NEXT: s_add_u32 s0, s10, s16 ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_and_b32 s1, s1, 1 -; GFX10-NEXT: s_mov_b32 s13, s12 +; GFX10-NEXT: s_mov_b32 s19, s18 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10-NEXT: s_mov_b32 s11, s10 -; GFX10-NEXT: s_addc_u32 s1, s19, s10 -; GFX10-NEXT: s_add_u32 s14, s14, s12 -; GFX10-NEXT: s_cselect_b32 s3, 1, 0 -; GFX10-NEXT: s_and_b32 s3, s3, 1 +; GFX10-NEXT: s_mov_b32 s17, s16 +; GFX10-NEXT: s_addc_u32 s1, s11, s16 +; GFX10-NEXT: s_add_u32 s2, s2, s18 +; GFX10-NEXT: s_cselect_b32 s6, 1, 0 +; GFX10-NEXT: s_and_b32 s6, s6, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX10-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10-NEXT: s_addc_u32 s15, s15, s12 -; GFX10-NEXT: s_xor_b64 s[18:19], s[0:1], s[10:11] -; GFX10-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_addc_u32 s3, s3, s18 +; GFX10-NEXT: s_xor_b64 s[10:11], s[0:1], s[16:17] +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[18:19] ; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14 -; GFX10-NEXT: s_sub_u32 s3, 0, s14 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX10-NEXT: s_sub_u32 s6, 0, s2 ; GFX10-NEXT: s_cselect_b32 s0, 1, 0 ; GFX10-NEXT: v_trunc_f32_e32 v2, v2 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 @@ -2089,15 +2089,15 @@ ; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX10-NEXT: s_subb_u32 s22, 0, s15 +; GFX10-NEXT: s_subb_u32 s7, 0, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_add_f32_e32 v0, v3, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s20, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, s22, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GFX10-NEXT: v_mul_lo_u32 v4, s21, v0 -; GFX10-NEXT: v_mul_hi_u32 v5, s20, v0 -; GFX10-NEXT: v_mul_lo_u32 v6, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v4, s23, v0 +; GFX10-NEXT: v_mul_hi_u32 v5, s22, v0 +; GFX10-NEXT: v_mul_lo_u32 v6, s22, v0 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1 ; GFX10-NEXT: v_add3_u32 v3, v4, v3, v5 ; GFX10-NEXT: v_trunc_f32_e32 v4, v7 @@ -2112,17 +2112,17 @@ ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_add_f32_e32 v1, v9, v1 ; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v8 -; GFX10-NEXT: v_mul_lo_u32 v9, s3, v4 +; GFX10-NEXT: v_mul_lo_u32 v9, s6, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v10, v6 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 -; GFX10-NEXT: v_mul_lo_u32 v12, s22, v1 -; GFX10-NEXT: v_mul_hi_u32 v13, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v12, s7, v1 +; GFX10-NEXT: v_mul_hi_u32 v13, s6, v1 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v11 -; GFX10-NEXT: v_mul_lo_u32 v11, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v11, s6, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v6, v5 @@ -2140,15 +2140,15 @@ ; GFX10-NEXT: v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo ; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_mul_lo_u32 v14, s21, v0 +; GFX10-NEXT: v_mul_lo_u32 v14, s23, v0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v9, v6 -; GFX10-NEXT: v_mul_hi_u32 v15, s20, v0 +; GFX10-NEXT: v_mul_hi_u32 v15, s22, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v11 -; GFX10-NEXT: v_mul_lo_u32 v16, s20, v12 +; GFX10-NEXT: v_mul_lo_u32 v16, s22, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v10 -; GFX10-NEXT: v_mul_lo_u32 v13, s20, v0 +; GFX10-NEXT: v_mul_lo_u32 v13, s22, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 ; GFX10-NEXT: v_add_co_u32 v5, s0, v7, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 @@ -2167,23 +2167,23 @@ ; GFX10-NEXT: v_add3_u32 v6, v7, v6, v8 ; GFX10-NEXT: v_add_co_u32 v5, s1, v10, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v15, v13 -; GFX10-NEXT: v_mul_lo_u32 v13, s22, v1 +; GFX10-NEXT: v_mul_lo_u32 v13, s7, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v11, s1, v4, v6, s0 ; GFX10-NEXT: v_add_co_u32 v5, s1, v5, v9 -; GFX10-NEXT: v_mul_hi_u32 v15, s3, v1 +; GFX10-NEXT: v_mul_hi_u32 v15, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v16 -; GFX10-NEXT: v_mul_lo_u32 v9, s3, v11 +; GFX10-NEXT: v_mul_lo_u32 v9, s6, v11 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v5 ; GFX10-NEXT: v_mul_hi_u32 v7, v12, v14 ; GFX10-NEXT: v_mul_lo_u32 v12, v11, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v10, v10, v16 -; GFX10-NEXT: s_load_dwordx4 s[20:23], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_add_co_u32 v5, s1, v8, v5 ; GFX10-NEXT: v_add3_u32 v9, v13, v9, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 @@ -2200,22 +2200,22 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v3, s1, v8, v3 -; GFX10-NEXT: v_mul_lo_u32 v8, s9, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s15, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v7, s1, v7, v13 -; GFX10-NEXT: v_mul_lo_u32 v14, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v12, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX10-NEXT: v_mul_lo_u32 v13, s9, v2 +; GFX10-NEXT: v_mul_lo_u32 v14, s14, v2 +; GFX10-NEXT: v_mul_hi_u32 v12, s14, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX10-NEXT: v_mul_lo_u32 v13, s15, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v3, s1, v3, v10 -; GFX10-NEXT: v_mul_hi_u32 v15, s8, v2 +; GFX10-NEXT: v_mul_hi_u32 v15, s14, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v14 ; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v0, s1, v13, v0 -; GFX10-NEXT: v_mul_hi_u32 v2, s9, v2 +; GFX10-NEXT: v_mul_hi_u32 v2, s15, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v8, s1, v8, v12 ; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10 @@ -2228,41 +2228,41 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1 ; GFX10-NEXT: v_add_co_u32 v3, s1, v3, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s1 -; GFX10-NEXT: v_mul_lo_u32 v6, s7, v0 +; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0 ; GFX10-NEXT: v_add3_u32 v2, v10, v8, v2 ; GFX10-NEXT: v_add3_u32 v5, v5, v7, v9 -; GFX10-NEXT: v_mul_hi_u32 v7, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v8, s6, v2 +; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0 +; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v4, vcc_lo, v4, v5, s0 -; GFX10-NEXT: v_mul_lo_u32 v5, s6, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, s8, v0 ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo ; GFX10-NEXT: v_add3_u32 v4, v6, v8, v7 -; GFX10-NEXT: v_mul_lo_u32 v6, s19, v1 -; GFX10-NEXT: v_mul_hi_u32 v7, s19, v1 -; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s8, v5 -; GFX10-NEXT: v_mul_lo_u32 v14, s18, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v4 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s9, v4, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v15, s19, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v5 -; GFX10-NEXT: v_mul_hi_u32 v1, s18, v1 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s7, v8, vcc_lo -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v4 -; GFX10-NEXT: v_mul_hi_u32 v17, s18, v3 +; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1 +; GFX10-NEXT: v_mul_hi_u32 v7, s11, v1 +; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s14, v5 +; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v8, s15, v4 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo +; GFX10-NEXT: v_mul_lo_u32 v15, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v5 +; GFX10-NEXT: v_mul_hi_u32 v1, s10, v1 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v4 +; GFX10-NEXT: v_mul_hi_u32 v17, s10, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0 -; GFX10-NEXT: v_mul_hi_u32 v3, s19, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v5, s6 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v5, s8 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v4 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s7, v8, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v4 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v10, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v13 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v12 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v14 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0 @@ -2275,7 +2275,7 @@ ; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v14, v1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v13 ; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v16, s0 ; GFX10-NEXT: v_add_co_u32 v7, s0, v7, v1 @@ -2284,52 +2284,52 @@ ; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s0, 0, v18, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v11 ; GFX10-NEXT: v_add3_u32 v3, v6, v1, v3 -; GFX10-NEXT: v_mul_lo_u32 v10, s15, v7 -; GFX10-NEXT: v_mul_lo_u32 v16, s14, v7 +; GFX10-NEXT: v_mul_lo_u32 v10, s3, v7 +; GFX10-NEXT: v_mul_lo_u32 v16, s2, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v15, s0 -; GFX10-NEXT: v_mul_lo_u32 v11, s14, v3 -; GFX10-NEXT: v_mul_hi_u32 v15, s14, v7 +; GFX10-NEXT: v_mul_lo_u32 v11, s2, v3 +; GFX10-NEXT: v_mul_hi_u32 v15, s2, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v14, s0 -; GFX10-NEXT: v_sub_co_u32 v14, s1, v12, s6 +; GFX10-NEXT: v_sub_co_u32 v14, s1, v12, s8 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo ; GFX10-NEXT: v_add3_u32 v6, v10, v11, v15 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v13, v8, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v14, s0 -; GFX10-NEXT: v_sub_co_u32 v8, s0, s18, v16 -; GFX10-NEXT: v_xor_b32_e32 v0, s16, v0 -; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s19, v6, s0 +; GFX10-NEXT: v_sub_co_u32 v8, s0, s10, v16 +; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v12, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s19, v6 -; GFX10-NEXT: v_xor_b32_e32 v1, s17, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v10 -; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 -; GFX10-NEXT: v_xor_b32_e32 v5, s2, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s11, v6 +; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v10 +; GFX10-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX10-NEXT: v_xor_b32_e32 v5, s12, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, vcc_lo, s15, v4, s0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v8 +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v8 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s14 +; GFX10-NEXT: v_sub_co_u32 v12, vcc_lo, v8, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s16 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s15, v4, vcc_lo -; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s17, v1, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v10 +; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s20 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo +; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v10 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v11, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v13 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v12 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0 ; GFX10-NEXT: v_add_co_u32 v15, s0, v7, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v3, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v13 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, v13 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v14, s0 ; GFX10-NEXT: v_add_co_u32 v14, s0, v15, 1 ; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 -; GFX10-NEXT: v_sub_co_u32 v11, s0, v12, s14 +; GFX10-NEXT: v_sub_co_u32 v11, s0, v12, s2 ; GFX10-NEXT: v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc_lo @@ -2340,20 +2340,20 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v4, s0 -; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v5, s2 -; GFX10-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v2, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v5, s12 +; GFX10-NEXT: s_xor_b64 s[0:1], s[16:17], s[18:19] +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX10-NEXT: v_xor_b32_e32 v3, s1, v3 -; GFX10-NEXT: v_xor_b32_e32 v6, s10, v6 -; GFX10-NEXT: v_xor_b32_e32 v7, s10, v8 +; GFX10-NEXT: v_xor_b32_e32 v6, s16, v6 +; GFX10-NEXT: v_xor_b32_e32 v7, s16, v8 ; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0 ; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo -; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s10 -; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s10, v7, vcc_lo +; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s16 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s16, v7, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[20:21] -; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[22:23] +; GFX10-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dwordx4 v9, v[4:7], s[6:7] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, <2 x i64> addrspace(1)* %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -8,45 +8,47 @@ ; CHECK-LABEL: v_srem_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v4 -; CHECK-NEXT: v_xor_b32_e32 v2, v2, v4 -; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v0 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_mul_lo_u32 v9, v8, v4 +; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 -; CHECK-NEXT: v_mul_hi_u32 v12, v7, v4 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 -; CHECK-NEXT: v_mul_lo_u32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v13, v4, v11 +; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 +; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 ; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -54,7 +56,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -65,18 +67,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v8, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v7, v4 +; CHECK-NEXT: v_mul_lo_u32 v12, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 -; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v11, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -84,7 +86,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -96,95 +98,93 @@ ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 ; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v3, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v2, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v2, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v0 +; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v1 +; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 +; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v1, v6, vcc +; CHECK-NEXT: v_xor_b32_e32 v2, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v2, v6, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v0, v4 -; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, %den ret i64 %result @@ -680,11 +680,13 @@ ; CGP-LABEL: v_srem_v2i64: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_or_b32_e32 v1, v9, v5 +; CGP-NEXT: v_mov_b32_e32 v11, v1 +; CGP-NEXT: v_mov_b32_e32 v10, v0 +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v3 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] @@ -692,44 +694,44 @@ ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v5 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v0 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v5, v0, vcc +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v5, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v0, v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v11 +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v11, v4, vcc ; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v9, v9 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v9 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v10, v10 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v4 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v16, v11, v2 +; CGP-NEXT: v_mul_lo_u32 v15, v11, v2 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v15 -; CGP-NEXT: v_mul_lo_u32 v16, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v9, v15 +; CGP-NEXT: v_mul_lo_u32 v14, v10, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v10, v15 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v17, v9, v13 +; CGP-NEXT: v_mul_lo_u32 v17, v10, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v10, v13 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 @@ -739,18 +741,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v9, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v12, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v10, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v12, v2 ; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v11, v4 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 +; CGP-NEXT: v_mul_lo_u32 v16, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v11, v11, v2 +; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v16 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 -; CGP-NEXT: v_mul_lo_u32 v15, v4, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v2, v11 ; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] @@ -758,7 +760,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] @@ -769,69 +771,69 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v10, v11, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v5, v10 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_mul_hi_u32 v12, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v1, v9 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v8, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v8, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v0 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_lo_u32 v11, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v1, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v5, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v0 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc -; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v5, v1 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v0 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc +; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v11, vcc, v3, v1 +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 -; CGP-NEXT: v_subb_u32_e32 v0, vcc, v4, v0, vcc +; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v10 -; CGP-NEXT: v_xor_b32_e32 v4, v0, v10 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -845,9 +847,9 @@ ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -857,54 +859,54 @@ ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v5, v3, v7 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 -; CGP-NEXT: v_xor_b32_e32 v4, v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 -; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 -; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v7, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v6 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, v9, v6, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v5, v6 +; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v8 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 +; CGP-NEXT: v_xor_b32_e32 v7, v7, v6 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v8, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v6, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v8, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -914,18 +916,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v10, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v8, v11, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v6 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_hi_u32 v11, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 -; CGP-NEXT: v_mul_lo_u32 v13, v6, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -933,7 +935,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -944,96 +946,94 @@ ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v7, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v7, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v7, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v9, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v8, v3, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v7, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v5 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc +; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v3 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v6 +; CGP-NEXT: v_xor_b32_e32 v4, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v6 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v6, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; CGP-NEXT: v_mov_b32_e32 v5, 0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, %den ret <2 x i64> %result @@ -2474,46 +2474,48 @@ ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v2 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v3, v2 -; CHECK-NEXT: v_xor_b32_e32 v2, v4, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v3 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 -; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 31, v6 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v5, v0 +; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v6, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v0 +; CHECK-NEXT: v_xor_b32_e32 v0, v2, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, 0, v1 +; CHECK-NEXT: v_subb_u32_e32 v8, vcc, 0, v0, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v3, v6 +; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 +; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_mul_lo_u32 v9, v8, v4 +; CHECK-NEXT: v_xor_b32_e32 v4, v4, v6 +; CHECK-NEXT: v_mul_lo_u32 v9, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v10, v7, v5 -; CHECK-NEXT: v_mul_hi_u32 v12, v7, v4 -; CHECK-NEXT: v_mul_lo_u32 v11, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v12, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v11, v7, v2 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CHECK-NEXT: v_mul_lo_u32 v10, v5, v11 -; CHECK-NEXT: v_mul_lo_u32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v13, v4, v11 +; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9 +; CHECK-NEXT: v_mul_hi_u32 v13, v2, v11 ; CHECK-NEXT: v_mul_hi_u32 v11, v5, v11 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc @@ -2521,7 +2523,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_mul_lo_u32 v13, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 ; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc @@ -2532,18 +2534,18 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 ; CHECK-NEXT: v_addc_u32_e64 v10, s[4:5], v5, v9, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v8, v4 +; CHECK-NEXT: v_mul_lo_u32 v8, v8, v2 ; CHECK-NEXT: v_mul_lo_u32 v11, v7, v10 -; CHECK-NEXT: v_mul_lo_u32 v12, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v7, v4 +; CHECK-NEXT: v_mul_lo_u32 v12, v7, v2 +; CHECK-NEXT: v_mul_hi_u32 v7, v7, v2 ; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v9 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v12 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; CHECK-NEXT: v_mul_lo_u32 v8, v10, v12 -; CHECK-NEXT: v_mul_lo_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_lo_u32 v11, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v12, v10, v12 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v8, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] @@ -2551,7 +2553,7 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_mul_lo_u32 v9, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8 -; CHECK-NEXT: v_mul_hi_u32 v11, v4, v7 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v7 ; CHECK-NEXT: v_mul_hi_u32 v7, v10, v7 ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v9, v12 ; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] @@ -2563,95 +2565,93 @@ ; CHECK-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v7, v9 ; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v1, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v9, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v9, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v5 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CHECK-NEXT: v_mul_hi_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v5 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v1, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 +; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v0 +; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5] -; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 -; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 +; CHECK-NEXT: v_sub_i32_e32 v7, vcc, v3, v1 +; CHECK-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v2 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v3 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 +; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[4:5] -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v1, v6, vcc -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_xor_b32_e32 v2, v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v6 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v2, v6, vcc +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 +; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v0, v2 -; CHECK-NEXT: v_mov_b32_e32 v1, v3 ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -2951,58 +2951,60 @@ ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 -; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: v_mov_b32_e32 v5, v0 -; CGP-NEXT: v_or_b32_e32 v1, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_mov_b32_e32 v7, v3 +; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 +; CGP-NEXT: v_mov_b32_e32 v9, v1 +; CGP-NEXT: v_mov_b32_e32 v8, v0 +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 +; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v11 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v0 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v0, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v3, v0, vcc ; CGP-NEXT: v_xor_b32_e32 v1, v1, v0 -; CGP-NEXT: v_xor_b32_e32 v0, v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v0 -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_xor_b32_e32 v5, v5, v10 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v10, vcc -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; CGP-NEXT: v_xor_b32_e32 v0, v2, v0 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v0 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v4 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v8, v8 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v0, vcc -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_xor_b32_e32 v6, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v13, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v7 -; CGP-NEXT: v_mul_hi_u32 v16, v11, v4 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v4 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v13, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v9, v2 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v2 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v4 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v15 -; CGP-NEXT: v_mul_lo_u32 v16, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v7, v15 +; CGP-NEXT: v_mul_lo_u32 v14, v8, v15 +; CGP-NEXT: v_mul_lo_u32 v16, v2, v13 +; CGP-NEXT: v_mul_hi_u32 v17, v2, v15 +; CGP-NEXT: v_mul_hi_u32 v15, v8, v15 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v17, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v17, v8, v13 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16 @@ -3012,27 +3014,27 @@ ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v7, v13, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v14 -; CGP-NEXT: v_mul_lo_u32 v16, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v11, v4 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_addc_u32_e64 v14, s[4:5], v8, v13, vcc +; CGP-NEXT: v_mul_lo_u32 v12, v12, v2 +; CGP-NEXT: v_mul_lo_u32 v15, v9, v14 +; CGP-NEXT: v_mul_lo_u32 v16, v9, v2 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v2 +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 -; CGP-NEXT: v_mul_hi_u32 v13, v4, v16 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v16 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v9 ; CGP-NEXT: v_mul_lo_u32 v12, v14, v16 -; CGP-NEXT: v_mul_lo_u32 v15, v4, v11 +; CGP-NEXT: v_mul_lo_u32 v15, v2, v9 ; CGP-NEXT: v_mul_hi_u32 v16, v14, v16 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v15 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_mul_lo_u32 v13, v14, v11 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v9 ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v15, v12 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v14, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15 @@ -3041,134 +3043,134 @@ ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v6, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v13, v3, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v6, v2 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v6, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v12, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v6, v8 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v11, v0, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v1, v7 -; CGP-NEXT: v_mul_lo_u32 v12, v1, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v1, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v6, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v6, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_lo_u32 v9, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v1, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v1, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v6, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v6, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v0 -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v0, vcc -; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v11, vcc, v5, v1 -; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v0 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v0, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v3, v1 +; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v0 -; CGP-NEXT: v_subb_u32_e32 v0, vcc, v4, v0, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v11, v1 +; CGP-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v9, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; CGP-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v1, v10 -; CGP-NEXT: v_xor_b32_e32 v4, v0, v10 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v10 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v10, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc +; CGP-NEXT: v_xor_b32_e32 v1, v1, v4 +; CGP-NEXT: v_xor_b32_e32 v2, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v2, v4, vcc +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v5, v0 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v10 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v5, v3, v9 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v9 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v4 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v5, v4 -; CGP-NEXT: v_xor_b32_e32 v4, v6, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v5 -; CGP-NEXT: v_cvt_f32_u32_e32 v7, v4 -; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc -; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v5 -; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 -; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 -; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v2 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v3, v2 +; CGP-NEXT: v_xor_b32_e32 v2, v4, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v4, v3 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 +; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_xor_b32_e32 v5, v5, v8 +; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc +; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, 0, v2, vcc +; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; CGP-NEXT: v_mul_f32_e32 v7, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v7, v7 -; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 +; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v10, v6 +; CGP-NEXT: v_xor_b32_e32 v6, v6, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v12, v9, v7 -; CGP-NEXT: v_mul_hi_u32 v14, v9, v6 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v6 +; CGP-NEXT: v_mul_hi_u32 v14, v9, v4 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v4 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; CGP-NEXT: v_mul_lo_u32 v12, v7, v13 -; CGP-NEXT: v_mul_lo_u32 v14, v6, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v6, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v15, v4, v13 ; CGP-NEXT: v_mul_hi_u32 v13, v7, v13 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc @@ -3176,7 +3178,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_mul_lo_u32 v15, v7, v11 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v14, v6, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 ; CGP-NEXT: v_mul_hi_u32 v11, v7, v11 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc @@ -3187,18 +3189,18 @@ ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 ; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_addc_u32_e64 v12, s[4:5], v7, v11, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v10, v6 +; CGP-NEXT: v_mul_lo_u32 v10, v10, v4 ; CGP-NEXT: v_mul_lo_u32 v13, v9, v12 -; CGP-NEXT: v_mul_lo_u32 v14, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v9, v6 +; CGP-NEXT: v_mul_lo_u32 v14, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v9, v9, v4 ; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v11 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 -; CGP-NEXT: v_mul_hi_u32 v11, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v11, v4, v14 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 ; CGP-NEXT: v_mul_lo_u32 v10, v12, v14 -; CGP-NEXT: v_mul_lo_u32 v13, v6, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v4, v9 ; CGP-NEXT: v_mul_hi_u32 v14, v12, v14 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] @@ -3206,7 +3208,7 @@ ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v13, v10 -; CGP-NEXT: v_mul_hi_u32 v13, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v9 ; CGP-NEXT: v_mul_hi_u32 v9, v12, v9 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] @@ -3218,95 +3220,93 @@ ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 ; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v11 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v3, v6 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v11, v2, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v3, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v6, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v11, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v11, v6, v7 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v2, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v6, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v5, v7 -; CGP-NEXT: v_mul_lo_u32 v10, v5, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v5, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v6, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v6, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v2 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v4 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v2 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v2, vcc ; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v9, vcc, v2, v5 -; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 +; CGP-NEXT: v_sub_i32_e32 v9, vcc, v5, v3 +; CGP-NEXT: v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v4 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v5 +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v2 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v8 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v8 +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v3, v3, v8 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v3, v8, vcc -; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: v_xor_b32_e32 v4, v2, v8 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v3, v8 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v4, v8, vcc +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v8 -; CGP-NEXT: v_mov_b32_e32 v5, 0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v8 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v8 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v8 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -8,39 +8,41 @@ ; CHECK-LABEL: v_udiv_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc -; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v4 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v4 +; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v0 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v5, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v11, v4, v8 -; CHECK-NEXT: v_mul_lo_u32 v13, v5, v8 -; CHECK-NEXT: v_mul_hi_u32 v14, v4, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_lo_u32 v13, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v14, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v13, v9 @@ -55,21 +57,21 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v4 -; CHECK-NEXT: v_mul_lo_u32 v7, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc +; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v0 ; CHECK-NEXT: v_mul_lo_u32 v6, v6, v9 ; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v4, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v6 ; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v13, v0, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v9, v6 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -85,95 +87,93 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4 -; CHECK-NEXT: v_mul_hi_u32 v9, v2, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v6, v2, v5 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v9, v2, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10 ; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 +; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v5, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB0_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v1, v3, vcc -; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v0, v4 -; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 %num, %den ret i64 %result @@ -628,9 +628,11 @@ ; CGP-LABEL: v_udiv_v2i64: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_or_b32_e32 v1, v9, v5 +; CGP-NEXT: v_mov_b32_e32 v10, v0 +; CGP-NEXT: v_mov_b32_e32 v11, v1 +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -640,8 +642,8 @@ ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc ; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -650,10 +652,10 @@ ; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v0 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v1, v13 ; CGP-NEXT: v_mul_hi_u32 v16, v0, v13 @@ -680,93 +682,93 @@ ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc ; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v10, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v13 ; CGP-NEXT: v_mul_lo_u32 v15, v13, v12 ; CGP-NEXT: v_mul_hi_u32 v16, v0, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; CGP-NEXT: v_mul_lo_u32 v11, v0, v10 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v15, v11 +; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v3, v2 +; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 +; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v15, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v16 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v13, v3 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v12, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v11, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v4, v0 ; CGP-NEXT: v_mul_lo_u32 v12, v5, v0 ; CGP-NEXT: v_mul_hi_u32 v13, v4, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v4, v1 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CGP-NEXT: v_mul_lo_u32 v2, v4, v1 ; CGP-NEXT: v_add_i32_e32 v14, vcc, 1, v0 ; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 ; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_subb_u32_e64 v11, s[4:5], v9, v10, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v9, v10 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v5, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5 -; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v5, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v5 +; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v11, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] @@ -780,53 +782,53 @@ ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v0, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v4 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v5, v3, v7 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v7 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v7 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v3, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 @@ -841,125 +843,123 @@ ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v5, v10, vcc -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v3, v10, vcc +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v11, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v11, v8 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v10 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v9, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v9, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v9, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_mul_lo_u32 v8, v6, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v4 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v2 +; CGP-NEXT: v_mul_hi_u32 v11, v6, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v6, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v8, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v8 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v9, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 -; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7 +; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB2_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v6 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v5, 0 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = udiv <2 x i64> %num, %den ret <2 x i64> %result @@ -2291,41 +2291,43 @@ ; CHECK-LABEL: v_udiv_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v4 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v5 -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 -; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v5, vcc -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v3, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 +; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v0 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v11, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v13, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v14, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_lo_u32 v13, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v14, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v13, v9 @@ -2340,22 +2342,22 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v3, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v6, v9 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc +; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v10, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v9 ; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v6 +; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v10 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v10, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v13, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v9, v2 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 @@ -2369,96 +2371,94 @@ ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v8 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v9, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v9, v5, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10 ; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v1, v6, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v4, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v8, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz BB7_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v1, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v0, v5 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v0, v2 -; CHECK-NEXT: v_mov_b32_e32 v1, v3 ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = udiv i64 %x, %shl.y @@ -2731,12 +2731,14 @@ ; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v5, v0 -; CGP-NEXT: v_mov_b32_e32 v7, v1 +; CGP-NEXT: v_mov_b32_e32 v8, v0 +; CGP-NEXT: v_mov_b32_e32 v9, v1 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 -; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 -; CGP-NEXT: v_or_b32_e32 v1, v7, v11 +; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 +; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2744,10 +2746,10 @@ ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v11 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2816,13 +2818,13 @@ ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v6, v5, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v7, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v7, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v7, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v6, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 @@ -2836,236 +2838,234 @@ ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v13, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v13, v2, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v10, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v2, v1 ; CGP-NEXT: v_add_i32_e32 v14, vcc, 1, v0 ; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 ; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14 ; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v7, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11 +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v9, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v11, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e32 v6, v13, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v3 +; CGP-NEXT: v_cndmask_b32_e32 v8, v13, v9, vcc +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v6, v2 ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v16, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v5, v0 -; CGP-NEXT: v_mul_lo_u32 v1, v0, v10 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v5, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v1, v0, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v1 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_or_b32_e32 v5, v3, v9 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v8 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v9 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v8 -; CGP-NEXT: v_subb_u32_e32 v7, vcc, 0, v9, vcc -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v6, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v11 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v11 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v5, v10, vcc -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v10 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 -; CGP-NEXT: v_mul_lo_u32 v7, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v6 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v3, v8, vcc +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v8 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v9, v8 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v9, v4 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v14 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v7, v2 +; CGP-NEXT: v_mul_hi_u32 v6, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v7, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v4 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v9, v10, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v10, v3 +; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 ; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v7, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v11 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v11, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v11 +; CGP-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v10, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc -; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[6:7], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[6:7] ; CGP-NEXT: s_cbranch_execz BB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v8 -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v8 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v2, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; CGP-NEXT: v_mov_b32_e32 v5, 0 +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v10 +; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[6:7] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = udiv <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -8,39 +8,41 @@ ; CHECK-LABEL: v_urem_i64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v5, v1, v3 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, v1 +; CHECK-NEXT: v_or_b32_e32 v1, v5, v3 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v2 -; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 ; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc -; CHECK-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CHECK-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CHECK-NEXT: v_trunc_f32_e32 v5, v5 -; CHECK-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v4 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v4 +; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v11, v6, v0 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v5, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v5, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v11, v4, v8 -; CHECK-NEXT: v_mul_lo_u32 v13, v5, v8 -; CHECK-NEXT: v_mul_hi_u32 v14, v4, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_lo_u32 v13, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v14, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v13, v9 @@ -55,21 +57,21 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v5, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v4 -; CHECK-NEXT: v_mul_lo_u32 v7, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc +; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v10, v6, v0 ; CHECK-NEXT: v_mul_lo_u32 v6, v6, v9 ; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v4, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v6 ; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v4, v6 +; CHECK-NEXT: v_mul_hi_u32 v13, v0, v6 ; CHECK-NEXT: v_mul_hi_u32 v6, v9, v6 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] @@ -85,92 +87,90 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 ; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v1, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v5 -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v5 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v1, v5 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v6, v5, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v4, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v5, v1 +; CHECK-NEXT: v_mul_hi_u32 v10, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v4 -; CHECK-NEXT: v_mul_lo_u32 v8, v3, v4 -; CHECK-NEXT: v_mul_hi_u32 v4, v2, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v2, v5 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v1, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v7, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v2, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6 +; CHECK-NEXT: v_mul_lo_u32 v1, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v7 +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v5, v0, vcc +; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v2 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v2 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v3, v7, v0, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB0_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v0, v4 -; CHECK-NEXT: v_mov_b32_e32 v1, v5 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = urem i64 %num, %den ret i64 %result @@ -620,9 +620,11 @@ ; CGP-LABEL: v_urem_v2i64: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v8, v0 -; CGP-NEXT: v_mov_b32_e32 v9, v1 -; CGP-NEXT: v_or_b32_e32 v1, v9, v5 +; CGP-NEXT: v_mov_b32_e32 v10, v0 +; CGP-NEXT: v_mov_b32_e32 v11, v1 +; CGP-NEXT: v_mov_b32_e32 v8, v2 +; CGP-NEXT: v_mov_b32_e32 v9, v3 +; CGP-NEXT: v_or_b32_e32 v1, v11, v5 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -632,8 +634,8 @@ ; CGP-NEXT: ; %bb.1: ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc ; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -642,10 +644,10 @@ ; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v0 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; CGP-NEXT: v_mul_lo_u32 v14, v1, v13 ; CGP-NEXT: v_mul_hi_u32 v16, v0, v13 @@ -672,73 +674,73 @@ ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; CGP-NEXT: v_addc_u32_e64 v13, s[4:5], v1, v12, vcc ; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v11, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v10, v10, v13 +; CGP-NEXT: v_mul_lo_u32 v12, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v13 ; CGP-NEXT: v_mul_lo_u32 v15, v13, v12 ; CGP-NEXT: v_mul_hi_u32 v16, v0, v12 ; CGP-NEXT: v_mul_hi_u32 v12, v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14 -; CGP-NEXT: v_mul_lo_u32 v11, v0, v10 -; CGP-NEXT: v_mul_lo_u32 v14, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v17, v0, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v10 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v15, v11 +; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v3, v2 +; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v14 +; CGP-NEXT: v_mul_lo_u32 v3, v0, v2 +; CGP-NEXT: v_mul_lo_u32 v14, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v17, v0, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v13, v2 +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v15, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v14, v12 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v11, v16 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v16 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v12, v17 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v13, v11 +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v13, v3 ; CGP-NEXT: v_add_i32_e64 v13, s[4:5], v14, v15 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v12, v3 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12 -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; CGP-NEXT: v_add_i32_e64 v2, s[4:5], v2, v12 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v9, v0 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v9, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v10, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v11, v1 +; CGP-NEXT: v_mul_hi_u32 v14, v10, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v11, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_lo_u32 v11, v4, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v4, v0 ; CGP-NEXT: v_mul_lo_u32 v12, v5, v0 ; CGP-NEXT: v_mul_hi_u32 v0, v4, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: v_mul_lo_u32 v1, v4, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v11 -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v0, vcc -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v3 +; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v11, v0, vcc +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v0 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v5 -; CGP-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v1, v4 ; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v0, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4 @@ -753,11 +755,11 @@ ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 -; CGP-NEXT: ; implicit-def: $vgpr8 +; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: BB2_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] @@ -771,9 +773,9 @@ ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 ; CGP-NEXT: v_mul_lo_u32 v0, v0, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc @@ -783,39 +785,39 @@ ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB2_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v5, v3, v7 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v9, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v7 -; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v7 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 +; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v11 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v3, v11 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v10 +; CGP-NEXT: v_mul_lo_u32 v15, v3, v10 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v3, v10 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 @@ -830,122 +832,120 @@ ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v5, v10, vcc -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v8, v11 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11 +; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v3, v10, vcc +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v10 +; CGP-NEXT: v_mul_lo_u32 v10, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v11 ; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v10 ; CGP-NEXT: v_mul_hi_u32 v10, v11, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12 -; CGP-NEXT: v_mul_lo_u32 v9, v4, v8 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v11, v8 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v13, v9 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v11, v4 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v14 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v11, v9 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 ; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v10, v9 +; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v10 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v9, v2 +; CGP-NEXT: v_mul_hi_u32 v5, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v9, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v9, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v8, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v9, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v11, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_mul_lo_u32 v9, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v6, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; CGP-NEXT: v_mul_lo_u32 v5, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v2 +; CGP-NEXT: v_mul_lo_u32 v10, v7, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v6, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v3, v6, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v9, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v3, v6 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; CGP-NEXT: v_sub_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7 ; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; CGP-NEXT: v_cndmask_b32_e32 v7, v9, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB2_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB2_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v6 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v8, v2 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v6 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v5, 0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB2_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = urem <2 x i64> %num, %den ret <2 x i64> %result @@ -1651,41 +1651,43 @@ ; CHECK-LABEL: v_urem_i64_pow2_shl_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v3, v0 +; CHECK-NEXT: v_mov_b32_e32 v4, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], 0x1000 -; CHECK-NEXT: v_lshl_b64 v[4:5], s[4:5], v2 -; CHECK-NEXT: v_or_b32_e32 v3, v1, v5 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; CHECK-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CHECK-NEXT: v_lshl_b64 v[5:6], s[4:5], v2 +; CHECK-NEXT: v_or_b32_e32 v1, v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 ; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_2 ; CHECK-NEXT: ; %bb.1: -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v4 -; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v5 -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 -; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v5, vcc -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v3, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v11, v6, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v6 +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5 +; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc +; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; CHECK-NEXT: v_trunc_f32_e32 v1, v1 +; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v10, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v11, v2, v0 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_mul_lo_u32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v9 -; CHECK-NEXT: v_mul_hi_u32 v9, v3, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v9 +; CHECK-NEXT: v_mul_hi_u32 v9, v1, v9 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; CHECK-NEXT: v_mul_lo_u32 v11, v2, v8 -; CHECK-NEXT: v_mul_lo_u32 v13, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v14, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v8 +; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8 +; CHECK-NEXT: v_mul_lo_u32 v13, v1, v8 +; CHECK-NEXT: v_mul_hi_u32 v14, v0, v8 +; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8 ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v9, vcc, v13, v9 @@ -1700,22 +1702,22 @@ ; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v3, v8, vcc -; CHECK-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 -; CHECK-NEXT: v_mul_lo_u32 v8, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v10, v6, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v6, v9 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_addc_u32_e64 v9, s[4:5], v1, v8, vcc +; CHECK-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8 +; CHECK-NEXT: v_mul_lo_u32 v8, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0 +; CHECK-NEXT: v_mul_hi_u32 v10, v2, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v2, v9 ; CHECK-NEXT: v_mul_lo_u32 v11, v9, v8 -; CHECK-NEXT: v_mul_hi_u32 v12, v2, v8 +; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8 ; CHECK-NEXT: v_mul_hi_u32 v8, v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CHECK-NEXT: v_mul_lo_u32 v7, v2, v6 -; CHECK-NEXT: v_mul_lo_u32 v10, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v13, v2, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v9, v6 +; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v7, v2 +; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v10 +; CHECK-NEXT: v_mul_lo_u32 v7, v0, v2 +; CHECK-NEXT: v_mul_lo_u32 v10, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v13, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v9, v2 ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8 @@ -1729,93 +1731,91 @@ ; CHECK-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; CHECK-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 -; CHECK-NEXT: v_add_i32_e64 v6, s[4:5], v6, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v0, v3 -; CHECK-NEXT: v_mul_lo_u32 v9, v1, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v0, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: v_add_i32_e64 v2, s[4:5], v2, v8 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v3, v1 +; CHECK-NEXT: v_mul_lo_u32 v9, v4, v1 +; CHECK-NEXT: v_mul_hi_u32 v10, v3, v1 +; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v4, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v1, v2, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v2 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v0, v4 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0 +; CHECK-NEXT: v_mul_lo_u32 v8, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_mul_lo_u32 v1, v5, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 +; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v0, vcc +; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] +; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 +; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc -; CHECK-NEXT: ; implicit-def: $vgpr4_vgpr5 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v5, v7, v0, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 +; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: BB7_2: ; %Flow ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz BB7_4 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1 -; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5 +; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0 ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0 +; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v4 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v0, v5 +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: BB7_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] -; CHECK-NEXT: v_mov_b32_e32 v0, v2 -; CHECK-NEXT: v_mov_b32_e32 v1, v3 ; CHECK-NEXT: s_setpc_b64 s[30:31] %shl.y = shl i64 4096, %y %r = urem i64 %x, %shl.y @@ -2086,12 +2086,14 @@ ; CGP-LABEL: v_urem_v2i64_pow2_shl_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_mov_b32_e32 v5, v0 -; CGP-NEXT: v_mov_b32_e32 v7, v1 +; CGP-NEXT: v_mov_b32_e32 v8, v0 +; CGP-NEXT: v_mov_b32_e32 v9, v1 +; CGP-NEXT: v_mov_b32_e32 v5, v2 +; CGP-NEXT: v_mov_b32_e32 v7, v3 ; CGP-NEXT: s_mov_b64 s[4:5], 0x1000 -; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v4 -; CGP-NEXT: v_lshl_b64 v[8:9], s[4:5], v6 -; CGP-NEXT: v_or_b32_e32 v1, v7, v11 +; CGP-NEXT: v_lshl_b64 v[2:3], s[4:5], v4 +; CGP-NEXT: v_lshl_b64 v[10:11], s[4:5], v6 +; CGP-NEXT: v_or_b32_e32 v1, v9, v3 ; CGP-NEXT: v_mov_b32_e32 v0, 0 ; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2099,10 +2101,10 @@ ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_2 ; CGP-NEXT: ; %bb.1: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v11 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2171,13 +2173,13 @@ ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v6, v5, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v7, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v1 -; CGP-NEXT: v_mul_lo_u32 v13, v7, v1 -; CGP-NEXT: v_mul_hi_u32 v14, v5, v1 -; CGP-NEXT: v_mul_hi_u32 v1, v7, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v9, v0 +; CGP-NEXT: v_mul_hi_u32 v6, v8, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v8, v1 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v1 +; CGP-NEXT: v_mul_hi_u32 v14, v8, v1 +; CGP-NEXT: v_mul_hi_u32 v1, v9, v1 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v13, v0 @@ -2191,230 +2193,228 @@ ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v10, v0 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v10, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v0 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v2, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_mul_lo_u32 v1, v10, v1 +; CGP-NEXT: v_mul_lo_u32 v1, v2, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v5, v6 -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v0, vcc -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v7, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v10 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v11 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v6 +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v0, vcc +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v1, v10 -; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 +; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v1, v2 +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v0, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v11 +; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v10, vcc, v6, v10 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v11 -; CGP-NEXT: v_cndmask_b32_e32 v11, v13, v12, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc -; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v6, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc -; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 -; CGP-NEXT: ; implicit-def: $vgpr5 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v12, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v0, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 +; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: BB8_2: ; %Flow2 ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_4 ; CGP-NEXT: ; %bb.3: -; CGP-NEXT: v_cvt_f32_u32_e32 v0, v10 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v0, v2 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2 ; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0 ; CGP-NEXT: v_mul_lo_u32 v1, v1, v0 ; CGP-NEXT: v_mul_hi_u32 v1, v0, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_mul_hi_u32 v0, v5, v0 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v10 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v10 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v10 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; CGP-NEXT: v_mov_b32_e32 v1, 0 ; CGP-NEXT: BB8_4: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_or_b32_e32 v5, v3, v9 -; CGP-NEXT: v_mov_b32_e32 v4, 0 -; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; CGP-NEXT: ; implicit-def: $vgpr4_vgpr5 +; CGP-NEXT: v_or_b32_e32 v3, v7, v11 +; CGP-NEXT: v_mov_b32_e32 v2, 0 +; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3 ; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_6 ; CGP-NEXT: ; %bb.5: -; CGP-NEXT: v_cvt_f32_u32_e32 v4, v8 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v9 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v8 -; CGP-NEXT: v_subb_u32_e32 v7, vcc, 0, v9, vcc -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v12, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v6, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 -; CGP-NEXT: v_mul_lo_u32 v12, v5, v11 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v11 -; CGP-NEXT: v_mul_hi_u32 v11, v5, v11 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v4, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v5, v10 -; CGP-NEXT: v_mul_hi_u32 v16, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v5, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v11 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v10 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v11, vcc +; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 +; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 +; CGP-NEXT: v_trunc_f32_e32 v3, v3 +; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v13, v4, v2 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_mul_lo_u32 v12, v3, v9 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v3, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v2, v8 +; CGP-NEXT: v_mul_lo_u32 v15, v3, v8 +; CGP-NEXT: v_mul_hi_u32 v16, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v3, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v15, v9 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v16 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_addc_u32_e64 v11, s[4:5], v5, v10, vcc -; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 -; CGP-NEXT: v_mul_lo_u32 v10, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v6, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v11 -; CGP-NEXT: v_mul_lo_u32 v13, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v14, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v10 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 -; CGP-NEXT: v_mul_lo_u32 v7, v4, v6 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v15, v4, v6 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v6 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v13, v7 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; CGP-NEXT: v_addc_u32_e64 v9, s[4:5], v3, v8, vcc +; CGP-NEXT: v_add_i32_e64 v3, s[4:5], v3, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v6, v6, v2 +; CGP-NEXT: v_mul_hi_u32 v12, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v9 +; CGP-NEXT: v_mul_lo_u32 v13, v9, v8 +; CGP-NEXT: v_mul_hi_u32 v14, v2, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v9, v8 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v6, v4 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v12 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v12, v9, v4 +; CGP-NEXT: v_mul_hi_u32 v15, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v9, v4 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v7, v14 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v10, v15 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v14 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; CGP-NEXT: v_add_i32_e64 v11, s[4:5], v12, v13 -; CGP-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] -; CGP-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10 -; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v5, v6, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v2, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v2, v5 -; CGP-NEXT: v_mul_lo_u32 v11, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v2, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v3, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v9, v6 +; CGP-NEXT: v_add_i32_e64 v9, s[4:5], v12, v13 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; CGP-NEXT: v_add_i32_e64 v8, s[4:5], v9, v8 +; CGP-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v7, v2 +; CGP-NEXT: v_mul_hi_u32 v6, v5, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v7, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v5, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v7, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v7, v3 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v8 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v2 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v3, v10, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 +; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc +; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v7, v2 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v5, v9 -; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v6, vcc, v2, v8 -; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v6, v8 -; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 -; CGP-NEXT: v_cndmask_b32_e32 v9, v11, v10, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc -; CGP-NEXT: ; implicit-def: $vgpr8_vgpr9 -; CGP-NEXT: ; implicit-def: $vgpr2 +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v11, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v11 +; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc +; CGP-NEXT: v_sub_i32_e32 v6, vcc, v3, v10 +; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v2, vcc +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v11, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc +; CGP-NEXT: v_sub_i32_e32 v10, vcc, v6, v10 +; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc +; CGP-NEXT: ; implicit-def: $vgpr10_vgpr11 +; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: BB8_6: ; %Flow ; CGP-NEXT: s_or_saveexec_b64 s[4:5], s[6:7] ; CGP-NEXT: s_xor_b64 exec, exec, s[4:5] ; CGP-NEXT: s_cbranch_execz BB8_8 ; CGP-NEXT: ; %bb.7: -; CGP-NEXT: v_cvt_f32_u32_e32 v3, v8 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v8 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v10 +; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 +; CGP-NEXT: v_mul_lo_u32 v3, v3, v2 ; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v8 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v8 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v2, v5, v2 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v10 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v10 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v8 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v8 -; CGP-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; CGP-NEXT: v_mov_b32_e32 v5, 0 +; CGP-NEXT: v_mov_b32_e32 v3, 0 ; CGP-NEXT: BB8_8: ; CGP-NEXT: s_or_b64 exec, exec, s[4:5] -; CGP-NEXT: v_mov_b32_e32 v2, v4 -; CGP-NEXT: v_mov_b32_e32 v3, v5 ; CGP-NEXT: s_setpc_b64 s[30:31] %shl.y = shl <2 x i64> , %y %r = urem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -1809,125 +1809,125 @@ ; ; GFX6-LABEL: sdiv_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd +; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd ; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s15, 0xf000 +; GFX6-NEXT: s_mov_b32 s14, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s2, s12, 31 -; GFX6-NEXT: s_add_i32 s3, s12, s2 -; GFX6-NEXT: s_xor_b32 s12, s3, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX6-NEXT: s_ashr_i32 s3, s13, 31 -; GFX6-NEXT: s_add_i32 s0, s13, s3 -; GFX6-NEXT: s_xor_b32 s13, s0, s3 +; GFX6-NEXT: s_ashr_i32 s2, s8, 31 +; GFX6-NEXT: s_add_i32 s3, s8, s2 +; GFX6-NEXT: s_xor_b32 s8, s3, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_ashr_i32 s3, s9, 31 +; GFX6-NEXT: s_add_i32 s0, s9, s3 +; GFX6-NEXT: s_xor_b32 s9, s0, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX6-NEXT: s_sub_i32 s1, 0, s12 -; GFX6-NEXT: s_ashr_i32 s0, s8, 31 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: s_sub_i32 s1, 0, s8 +; GFX6-NEXT: s_ashr_i32 s0, s4, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_xor_b32 s2, s0, s2 ; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX6-NEXT: s_add_i32 s1, s8, s0 +; GFX6-NEXT: s_add_i32 s1, s4, s0 ; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s13 +; GFX6-NEXT: s_sub_i32 s0, 0, s9 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s12 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s8 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s12, v3 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: s_ashr_i32 s0, s9, 31 -; GFX6-NEXT: s_add_i32 s1, s9, s0 +; GFX6-NEXT: s_ashr_i32 s0, s5, 31 +; GFX6-NEXT: s_add_i32 s1, s5, s0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: s_xor_b32 s2, s0, s3 -; GFX6-NEXT: s_ashr_i32 s3, s14, 31 +; GFX6-NEXT: s_ashr_i32 s3, s10, 31 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: s_add_i32 s0, s14, s3 -; GFX6-NEXT: s_xor_b32 s9, s0, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 +; GFX6-NEXT: s_add_i32 s0, s10, s3 +; GFX6-NEXT: s_xor_b32 s5, s0, s3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s13 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 ; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s13, v2 +; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s9 +; GFX6-NEXT: s_sub_i32 s0, 0, s5 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 ; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: s_ashr_i32 s0, s10, 31 -; GFX6-NEXT: s_add_i32 s8, s15, s2 -; GFX6-NEXT: s_add_i32 s1, s10, s0 -; GFX6-NEXT: s_xor_b32 s8, s8, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 +; GFX6-NEXT: s_ashr_i32 s2, s11, 31 +; GFX6-NEXT: s_ashr_i32 s0, s6, 31 +; GFX6-NEXT: s_add_i32 s4, s11, s2 +; GFX6-NEXT: s_add_i32 s1, s6, s0 +; GFX6-NEXT: s_xor_b32 s4, s4, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX6-NEXT: s_xor_b32 s3, s0, s3 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, s9 +; GFX6-NEXT: v_mul_lo_u32 v3, v2, s5 ; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s8 +; GFX6-NEXT: s_sub_i32 s0, 0, s4 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: s_ashr_i32 s0, s11, 31 -; GFX6-NEXT: s_add_i32 s1, s11, s0 +; GFX6-NEXT: s_ashr_i32 s0, s7, 31 +; GFX6-NEXT: s_add_i32 s1, s7, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 ; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 ; GFX6-NEXT: s_xor_b32 s2, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v4, s8 +; GFX6-NEXT: v_mul_lo_u32 v3, v4, s4 ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s8, v3 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc ; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: @@ -10600,35 +10600,31 @@ ; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc ; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 ; GFX9-NEXT: s_addc_u32 s3, s3, s4 -; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[4:5] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 -; GFX9-NEXT: s_sub_u32 s4, 0, s14 -; GFX9-NEXT: s_subb_u32 s5, 0, s15 +; GFX9-NEXT: s_sub_u32 s8, 0, s12 +; GFX9-NEXT: s_subb_u32 s4, 0, s13 ; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 ; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s9, 31 -; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX9-NEXT: v_mul_lo_u32 v5, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 @@ -10648,11 +10644,11 @@ ; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v8, s5, v0 -; GFX9-NEXT: v_mul_lo_u32 v9, s4, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s8, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v9, s8, v0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 ; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 @@ -10668,65 +10664,69 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] -; GFX9-NEXT: s_add_u32 s2, s8, s6 -; GFX9-NEXT: s_addc_u32 s3, s9, s6 +; GFX9-NEXT: s_add_u32 s2, s4, s8 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[6:7] +; GFX9-NEXT: s_addc_u32 s3, s5, s8 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v7, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v7, s15, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v2, s14, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s15, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s14, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s13, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s12, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], s8, v0 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], s14, v0 +; GFX9-NEXT: v_sub_u32_e32 v2, s15, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[2:3], s14, v0 +; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[2:3], s12, v0 ; GFX9-NEXT: v_subbrev_co_u32_e64 v7, vcc, 0, v2, s[2:3] -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s14, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v4 ; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: v_mov_b32_e32 v7, s15 ; GFX9-NEXT: v_subb_co_u32_e64 v1, vcc, v7, v1, s[0:1] -; GFX9-NEXT: s_ashr_i32 s0, s13, 31 -; GFX9-NEXT: s_add_u32 s8, s12, s0 +; GFX9-NEXT: s_ashr_i32 s0, s11, 31 +; GFX9-NEXT: s_add_u32 s10, s10, s0 ; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_addc_u32 s9, s13, s0 -; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s9 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 +; GFX9-NEXT: s_addc_u32 s11, s11, s0 +; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[0:1] +; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 ; GFX9-NEXT: v_mac_f32_e32 v9, s16, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_rcp_f32_e32 v8, v9 @@ -10739,8 +10739,8 @@ ; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: s_sub_u32 s2, 0, s8 -; GFX9-NEXT: s_subb_u32 s3, 0, s9 +; GFX9-NEXT: s_sub_u32 s2, 0, s10 +; GFX9-NEXT: s_subb_u32 s3, 0, s11 ; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 ; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 ; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 @@ -10757,7 +10757,7 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc ; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 -; GFX9-NEXT: s_ashr_i32 s12, s11, 31 +; GFX9-NEXT: s_ashr_i32 s12, s7, 31 ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc @@ -10788,60 +10788,60 @@ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s10, s12 +; GFX9-NEXT: s_add_u32 s0, s6, s12 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: s_addc_u32 s1, s11, s12 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] +; GFX9-NEXT: s_addc_u32 s1, s7, s12 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, s10, v2 -; GFX9-NEXT: v_mul_hi_u32 v9, s10, v3 -; GFX9-NEXT: v_mul_hi_u32 v10, s11, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, s11, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 +; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 +; GFX9-NEXT: v_mul_hi_u32 v10, s7, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, s7, v3 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v9, s11, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 +; GFX9-NEXT: v_mul_lo_u32 v9, s7, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, s8, v2 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v3 +; GFX9-NEXT: v_mul_hi_u32 v4, s10, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, s11, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v2 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_sub_co_u32_e64 v2, s[0:1], s10, v2 -; GFX9-NEXT: v_sub_u32_e32 v4, s11, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s8, v0 +; GFX9-NEXT: v_sub_co_u32_e64 v2, s[0:1], s6, v2 +; GFX9-NEXT: v_sub_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc ; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[2:3], s8, v2 +; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[2:3], s10, v2 ; GFX9-NEXT: v_subbrev_co_u32_e64 v8, vcc, 0, v4, s[2:3] -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v8 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v8 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc ; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s8, v7 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s10, v7 ; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v8, s7 ; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v8, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -157,65 +157,65 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -226,23 +226,23 @@ ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -250,7 +250,7 @@ ; GFX9-NEXT: BB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -262,26 +262,26 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s3, s2, s3 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mul_i32 s2, s8, s2 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -289,7 +289,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -602,9 +602,9 @@ ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB3_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -615,10 +615,10 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB3_2: @@ -626,11 +626,11 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc @@ -642,9 +642,9 @@ ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX89-NEXT: s_mov_b64 s[6:7], exec ; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX89-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX89-NEXT: s_cbranch_execz BB3_2 ; GFX89-NEXT: ; %bb.1: @@ -655,20 +655,20 @@ ; GFX89-NEXT: s_mov_b32 s11, 0xf000 ; GFX89-NEXT: s_mov_b32 s10, -1 ; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 -; GFX89-NEXT: v_mov_b32_e32 v2, 0 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX89-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol ; GFX89-NEXT: BB3_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_readfirstlane_b32 s2, v1 -; GFX89-NEXT: v_readfirstlane_b32 s3, v2 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 -; GFX89-NEXT: v_mov_b32_e32 v2, s3 -; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] +; GFX89-NEXT: v_readfirstlane_b32 s2, v0 +; GFX89-NEXT: v_readfirstlane_b32 s3, v1 +; GFX89-NEXT: v_mov_b32_e32 v0, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s3 +; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX89-NEXT: s_mov_b32 s3, 0xf000 ; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_nop 2 @@ -679,25 +679,25 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB3_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s8, s2 ; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -705,9 +705,9 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -717,24 +717,24 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB3_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s8, s2 ; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -742,9 +742,9 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -762,9 +762,9 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -775,13 +775,13 @@ ; GFX7LESS-NEXT: s_mov_b32 s13, s7 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB4_2: @@ -789,16 +789,16 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX7LESS-NEXT: v_mul_hi_u32 v2, s0, v0 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -808,42 +808,42 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_mul_i32 s7, s1, s6 ; GFX8-NEXT: s_mul_i32 s6, s0, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc +; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v1 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -853,9 +853,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: @@ -869,23 +869,23 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc @@ -898,10 +898,10 @@ ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: @@ -912,14 +912,14 @@ ; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 ; GFX1064-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 -; GFX1064-NEXT: v_mov_b32_e32 v1, s8 -; GFX1064-NEXT: v_mov_b32_e32 v2, s10 +; GFX1064-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064-NEXT: v_mov_b32_e32 v1, s10 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s8, s6 ; GFX1064-NEXT: s_mov_b32 s9, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -927,15 +927,15 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v0 +; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2 ; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm @@ -946,9 +946,9 @@ ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: @@ -959,14 +959,14 @@ ; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 ; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032-NEXT: v_mov_b32_e32 v2, s9 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -974,15 +974,15 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2 ; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm @@ -1248,65 +1248,65 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd +; GFX7LESS-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s2, s8, s2 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -1317,23 +1317,23 @@ ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b64 s[8:9], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB7_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s8, s2 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s12, s6 ; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1341,7 +1341,7 @@ ; GFX9-NEXT: BB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1353,26 +1353,26 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x34 -; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_load_dword s8, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB7_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s15, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s3, s2, s3 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: s_mov_b32 s8, s6 -; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_mul_i32 s2, s8, s2 +; GFX1064-NEXT: s_mov_b32 s14, -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -1380,7 +1380,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1693,9 +1693,9 @@ ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -1706,10 +1706,10 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB9_2: @@ -1717,11 +1717,11 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1733,9 +1733,9 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: @@ -1746,18 +1746,18 @@ ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s5, v2 -; GFX8-NEXT: v_readfirstlane_b32 s4, v1 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1772,9 +1772,9 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: @@ -1785,18 +1785,18 @@ ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s5, v2 -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1810,25 +1810,25 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s8, s2 ; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -1836,13 +1836,13 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm @@ -1851,24 +1851,24 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s5, s5, 5 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s8, s2 ; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -1876,13 +1876,13 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -1899,9 +1899,9 @@ ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB10_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -1912,13 +1912,13 @@ ; GFX7LESS-NEXT: s_mov_b32 s13, s7 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc +; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB10_2: @@ -1926,16 +1926,16 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) -; GFX7LESS-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX7LESS-NEXT: v_mul_hi_u32 v2, s0, v0 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1945,42 +1945,42 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b64 s[8:9], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB10_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s12, s6 ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v1, s6 -; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: s_mov_b32 s13, s7 ; GFX8-NEXT: s_mul_i32 s7, s1, s6 ; GFX8-NEXT: s_mul_i32 s6, s0, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 ; GFX8-NEXT: s_mov_b32 s15, 0xf000 ; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc +; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB10_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v1 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -1990,9 +1990,9 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB10_2 ; GFX9-NEXT: ; %bb.1: @@ -2006,23 +2006,23 @@ ; GFX9-NEXT: s_mul_i32 s6, s2, s6 ; GFX9-NEXT: s_mov_b32 s15, 0xf000 ; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc @@ -2035,10 +2035,10 @@ ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB10_2 ; GFX1064-NEXT: ; %bb.1: @@ -2049,14 +2049,14 @@ ; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 ; GFX1064-NEXT: s_mul_i32 s8, s2, s8 ; GFX1064-NEXT: s_add_i32 s10, s10, s9 -; GFX1064-NEXT: v_mov_b32_e32 v1, s8 -; GFX1064-NEXT: v_mov_b32_e32 v2, s10 +; GFX1064-NEXT: v_mov_b32_e32 v0, s8 +; GFX1064-NEXT: v_mov_b32_e32 v1, s10 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s8, s6 ; GFX1064-NEXT: s_mov_b32 s9, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -2064,15 +2064,15 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm @@ -2083,9 +2083,9 @@ ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB10_2 ; GFX1032-NEXT: ; %bb.1: @@ -2096,14 +2096,14 @@ ; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 ; GFX1032-NEXT: s_mul_i32 s1, s2, s1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 -; GFX1032-NEXT: v_mov_b32_e32 v2, s9 +; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s9 ; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -2111,15 +2111,15 @@ ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -176,30 +176,30 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB1_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -207,28 +207,28 @@ ; GFX8-LABEL: add_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -239,27 +239,27 @@ ; GFX9-LABEL: add_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -271,20 +271,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s3, s2, s3 -; GFX1064-NEXT: v_mov_b32_e32 v2, s3 +; GFX1064-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -294,7 +294,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -735,27 +735,27 @@ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 @@ -769,28 +769,28 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] +; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_readfirstlane_b32 s3, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_nop 2 @@ -802,27 +802,27 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] +; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 2 @@ -833,28 +833,28 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] +; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -865,27 +865,27 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v2, v[1:2] +; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -905,9 +905,9 @@ ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB5_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -915,14 +915,14 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] @@ -931,15 +931,15 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s4, s0 ; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -948,41 +948,41 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz BB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: s_mul_i32 s7, s3, s6 ; GFX8-NEXT: s_mul_i32 s6, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s2, v2 ; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_readfirstlane_b32 s1, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -991,9 +991,9 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz BB5_2 ; GFX9-NEXT: ; %bb.1: @@ -1003,24 +1003,24 @@ ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1032,10 +1032,10 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB5_2 ; GFX1064-NEXT: ; %bb.1: @@ -1046,25 +1046,25 @@ ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1064-NEXT: s_mul_i32 s6, s2, s6 ; GFX1064-NEXT: s_add_i32 s8, s8, s7 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064-NEXT: v_mov_b32_e32 v2, s8 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v0 +; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1074,9 +1074,9 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB5_2 ; GFX1032-NEXT: ; %bb.1: @@ -1087,25 +1087,25 @@ ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1032-NEXT: s_mul_i32 s5, s2, s5 ; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] +; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1347,30 +1347,30 @@ ; GFX7LESS: ; %bb.0: ; %entry ; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB8_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mul_i32 s2, s6, s2 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm @@ -1378,28 +1378,28 @@ ; GFX8-LABEL: sub_i32_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_cbranch_execz BB8_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mul_i32 s2, s6, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 @@ -1410,27 +1410,27 @@ ; GFX9-LABEL: sub_i32_uniform: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_mul_i32 s2, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -1442,20 +1442,20 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_load_dword s6, s[0:1], 0x2c +; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mul_i32 s3, s2, s3 -; GFX1064-NEXT: v_mov_b32_e32 v2, s3 +; GFX1064-NEXT: s_mul_i32 s2, s6, s2 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -1465,7 +1465,7 @@ ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 @@ -1906,27 +1906,27 @@ ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 @@ -1940,27 +1940,27 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[4:5], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s3, v2 -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1974,26 +1974,26 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[4:5], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc @@ -2006,32 +2006,32 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2041,31 +2041,31 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: v_mov_b32_e32 v0, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v2, v[1:2] +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 -; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2084,9 +2084,9 @@ ; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB12_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -2094,14 +2094,14 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2110,15 +2110,15 @@ ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s4, s0 ; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX7LESS-NEXT: v_mul_hi_u32 v2, s2, v0 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2127,41 +2127,41 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_mov_b64 s[6:7], exec ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz BB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 +; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: s_mul_i32 s7, s3, s6 ; GFX8-NEXT: s_mul_i32 s6, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 -; GFX8-NEXT: v_mul_lo_u32 v1, s3, v0 -; GFX8-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX8-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, s2, v2 ; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_readfirstlane_b32 s1, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v1 +; GFX8-NEXT: v_mul_lo_u32 v1, s2, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -2170,9 +2170,9 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b64 s[6:7], exec ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz BB12_2 ; GFX9-NEXT: ; %bb.1: @@ -2182,24 +2182,24 @@ ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 ; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 @@ -2211,10 +2211,10 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB12_2 ; GFX1064-NEXT: ; %bb.1: @@ -2225,25 +2225,25 @@ ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1064-NEXT: s_mul_i32 s6, s2, s6 ; GFX1064-NEXT: s_add_i32 s8, s8, s7 -; GFX1064-NEXT: v_mov_b32_e32 v1, s6 -; GFX1064-NEXT: v_mov_b32_e32 v2, s8 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: BB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0 +; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2253,9 +2253,9 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo -; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB12_2 ; GFX1032-NEXT: ; %bb.1: @@ -2266,25 +2266,25 @@ ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1032-NEXT: s_mul_i32 s5, s2, s5 ; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v1, s5 -; GFX1032-NEXT: v_mov_b32_e32 v2, s7 +; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: BB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 -; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v2 +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2 +; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0 +; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1520,11 +1520,11 @@ ; VI-LABEL: ctpop_i16_in_br: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dword s0, s[0:1], 0x34 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s1, 0 -; VI-NEXT: s_and_b64 vcc, exec, s[2:3] +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s0, 0 +; VI-NEXT: s_and_b64 vcc, exec, s[0:1] ; VI-NEXT: s_cbranch_vccz BB14_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: s_mov_b32 s11, 0xf000 @@ -1537,7 +1537,7 @@ ; VI-NEXT: BB14_2: ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: BB14_3: ; %if -; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_and_b32 s0, s2, 0xffff ; VI-NEXT: s_bcnt1_i32_b32 s0, s0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -20,46 +20,46 @@ ; ISA: ; %bb.0: ; %start ; ISA-NEXT: v_readfirstlane_b32 s0, v0 ; ISA-NEXT: s_mov_b32 m0, s0 -; ISA-NEXT: s_mov_b32 s0, 0 +; ISA-NEXT: s_mov_b32 s8, 0 ; ISA-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x ; ISA-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; ISA-NEXT: s_mov_b64 s[2:3], 0 -; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: s_mov_b64 s[0:1], 0 ; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: ; implicit-def: $sgpr2_sgpr3 ; ISA-NEXT: s_branch BB0_3 ; ISA-NEXT: BB0_1: ; %Flow1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_or_b64 exec, exec, s[8:9] -; ISA-NEXT: s_add_i32 s0, s0, 1 -; ISA-NEXT: s_mov_b64 s[8:9], 0 +; ISA-NEXT: s_or_b64 exec, exec, s[6:7] +; ISA-NEXT: s_add_i32 s8, s8, 1 +; ISA-NEXT: s_mov_b64 s[6:7], 0 ; ISA-NEXT: BB0_2: ; %Flow ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; ISA-NEXT: s_or_b64 s[2:3], s[10:11], s[2:3] -; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec -; ISA-NEXT: s_and_b64 s[8:9], s[8:9], exec -; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; ISA-NEXT: s_andn2_b64 exec, exec, s[2:3] +; ISA-NEXT: s_and_b64 s[10:11], exec, s[4:5] +; ISA-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; ISA-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; ISA-NEXT: s_and_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; ISA-NEXT: s_andn2_b64 exec, exec, s[0:1] ; ISA-NEXT: s_cbranch_execz BB0_6 ; ISA-NEXT: BB0_3: ; %loop ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: s_or_b64 s[6:7], s[6:7], exec -; ISA-NEXT: s_cmp_lt_u32 s0, 32 -; ISA-NEXT: s_mov_b64 s[8:9], -1 +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_cmp_lt_u32 s8, 32 +; ISA-NEXT: s_mov_b64 s[6:7], -1 ; ISA-NEXT: s_cbranch_scc0 BB0_2 ; ISA-NEXT: ; %bb.4: ; %endif1 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_mov_b64 s[6:7], -1 -; ISA-NEXT: s_and_saveexec_b64 s[8:9], vcc +; ISA-NEXT: s_mov_b64 s[4:5], -1 +; ISA-NEXT: s_and_saveexec_b64 s[6:7], vcc ; ISA-NEXT: s_cbranch_execz BB0_1 ; ISA-NEXT: ; %bb.5: ; %endif2 ; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: s_xor_b64 s[4:5], exec, -1 ; ISA-NEXT: s_branch BB0_1 ; ISA-NEXT: BB0_6: ; %Flow2 -; ISA-NEXT: s_or_b64 exec, exec, s[2:3] +; ISA-NEXT: s_or_b64 exec, exec, s[0:1] ; ISA-NEXT: v_mov_b32_e32 v1, 0 -; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[4:5] +; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; ISA-NEXT: ; %bb.7: ; %if1 ; ISA-NEXT: v_sqrt_f32_e32 v1, v0 ; ISA-NEXT: ; %bb.8: ; %endloop diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll @@ -10,18 +10,18 @@ define i32 @fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 { ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr ; GCN: bb.0.begin: - ; GCN: liveins: $sgpr7, $sgpr30_sgpr31 - ; GCN: $sgpr7 = frame-setup COPY $sgpr33 + ; GCN: liveins: $sgpr11, $sgpr30_sgpr31 + ; GCN: $sgpr11 = frame-setup COPY $sgpr33 ; GCN: $sgpr33 = frame-setup COPY $sgpr32 ; GCN: bb.1.lp_end: - ; GCN: liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr30_sgpr31 ; GCN: bb.2.lp_begin: - ; GCN: liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr10, $sgpr11, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr30_sgpr31 ; GCN: bb.3.Flow: - ; GCN: liveins: $sgpr6, $sgpr7, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31 + ; GCN: liveins: $sgpr10, $sgpr11, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr30_sgpr31 ; GCN: bb.4.end: - ; GCN: liveins: $sgpr7, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31 - ; GCN: $sgpr33 = frame-destroy COPY $sgpr7 + ; GCN: liveins: $sgpr11, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31 + ; GCN: $sgpr33 = frame-destroy COPY $sgpr11 begin: br label %lp_begin diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -989,23 +989,23 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(double addrspace(3)* %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s2 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0 ; GFX90A-NEXT: BB52_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0 -; GFX90A-NEXT: v_mov_b32_e32 v4, s0 +; GFX90A-NEXT: v_mov_b32_e32 v4, s2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1] -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX90A-NEXT: s_cbranch_execnz BB52_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll @@ -14,22 +14,22 @@ ; GCN-LABEL: test_move_load_address_to_vgpr: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dword v1, v2, s[0:1] glc +; GCN-NEXT: global_load_dword v0, v1, s[0:1] glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_add_u32_e32 v0, 0xffffff00, v1 -; GCN-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; GCN-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GCN-NEXT: v_addc_co_u32_e32 v2, vcc, v3, v2, vcc +; GCN-NEXT: v_add_u32_e32 v2, 0xffffff00, v0 +; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: BB0_1: ; %bb3 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: global_load_dword v3, v[1:2], off glc +; GCN-NEXT: global_load_dword v3, v[0:1], off glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_add_co_u32_e64 v1, s[0:1], 4, v1 -; GCN-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2 +; GCN-NEXT: v_add_co_u32_e64 v0, s[0:1], 4, v0 +; GCN-NEXT: v_addc_co_u32_e64 v1, s[0:1], 0, v1, s[0:1] ; GCN-NEXT: s_and_b64 vcc, exec, vcc ; GCN-NEXT: s_cbranch_vccz BB0_1 ; GCN-NEXT: ; %bb.2: ; %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -4,47 +4,47 @@ define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) { ; SI-LABEL: i1_copy_from_loop: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s14, 0 ; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: ; implicit-def: $sgpr6_sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SI-NEXT: ; implicit-def: $sgpr10_sgpr11 ; SI-NEXT: s_branch BB0_3 ; SI-NEXT: BB0_1: ; %Flow1 ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[14:15] +; SI-NEXT: s_or_b64 exec, exec, s[12:13] ; SI-NEXT: BB0_2: ; %Flow ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: s_and_b64 s[14:15], exec, s[10:11] -; SI-NEXT: s_or_b64 s[4:5], s[14:15], s[4:5] -; SI-NEXT: s_andn2_b64 s[8:9], s[8:9], exec -; SI-NEXT: s_and_b64 s[12:13], s[12:13], exec -; SI-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; SI-NEXT: s_and_b64 s[12:13], exec, s[8:9] +; SI-NEXT: s_or_b64 s[4:5], s[12:13], s[4:5] +; SI-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz BB0_6 ; SI-NEXT: BB0_3: ; %for.body ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_or_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cmp_gt_u32 s6, 3 -; SI-NEXT: v_cmp_lt_u32_e64 s[12:13], s6, 4 +; SI-NEXT: s_or_b64 s[8:9], s[8:9], exec +; SI-NEXT: s_cmp_gt_u32 s14, 3 +; SI-NEXT: v_cmp_lt_u32_e64 s[10:11], s14, 4 ; SI-NEXT: s_cbranch_scc1 BB0_2 ; SI-NEXT: ; %bb.4: ; %mid.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 idxen offen -; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 -; SI-NEXT: s_mov_b64 s[10:11], -1 -; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc +; SI-NEXT: s_mov_b64 s[8:9], -1 +; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc ; SI-NEXT: s_cbranch_execz BB0_1 ; SI-NEXT: ; %bb.5: ; %end.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; SI-NEXT: s_add_i32 s6, s6, 1 -; SI-NEXT: s_xor_b64 s[10:11], exec, -1 +; SI-NEXT: s_add_i32 s14, s14, 1 +; SI-NEXT: s_xor_b64 s[8:9], exec, -1 ; SI-NEXT: s_branch BB0_1 ; SI-NEXT: BB0_6: ; %for.end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_and_saveexec_b64 s[0:1], s[8:9] +; SI-NEXT: s_and_saveexec_b64 s[0:1], s[6:7] ; SI-NEXT: s_cbranch_execz BB0_8 ; SI-NEXT: ; %bb.7: ; %if ; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -5,85 +5,85 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; GFX9-LABEL: udiv32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_sub_i32 s4, 0, s5 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v2, s5, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2 ; GFX9-NEXT: v_not_b32_e32 v5, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, s5, v5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_add_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX9-NEXT: v_add_u32_e32 v3, s2, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_add_u32_e32 v4, s4, v5 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: s_add_u32 s4, s4, 1 +; GFX9-NEXT: s_add_u32 s2, s2, 1 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX9-NEXT: s_cbranch_scc0 BB0_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udiv32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s5, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s5, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: BB0_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mul_lo_u32 v2, s5, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-NEXT: v_mul_lo_u32 v4, s3, v2 +; GFX10-NEXT: v_mul_lo_u32 v4, s5, v2 ; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v2 -; GFX10-NEXT: v_mul_lo_u32 v3, s2, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v4, s4, v4 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, s4, v3 -; GFX10-NEXT: s_add_u32 s4, s4, 1 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 +; GFX10-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v3, s2, v3 +; GFX10-NEXT: s_add_u32 s2, s2, 1 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX10-NEXT: s_cbranch_scc0 BB0_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -107,81 +107,81 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) { ; GFX9-LABEL: urem32_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s3, 0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s5, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v2, s5, v0 -; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s5, v2 ; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v4, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_add_u32_e32 v2, s4, v2 -; GFX9-NEXT: s_add_u32 s4, s4, 1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s2, v2 +; GFX9-NEXT: s_add_u32 s2, s2, 1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX9-NEXT: s_cbranch_scc0 BB1_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: urem32_invariant_denom: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX10-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX10-NEXT: s_sub_i32 s5, 0, s4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX10-NEXT: v_mul_lo_u32 v1, s5, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: BB1_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mul_lo_u32 v2, s5, v0 -; GFX10-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX10-NEXT: v_mul_lo_u32 v3, s5, v2 ; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_mul_lo_u32 v2, s2, v2 -; GFX10-NEXT: v_add_nc_u32_e32 v4, s4, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v4 +; GFX10-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v4, s2, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2 -; GFX10-NEXT: s_add_u32 s4, s4, 1 -; GFX10-NEXT: s_addc_u32 s5, s5, 0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s2, v2 +; GFX10-NEXT: s_add_u32 s2, s2, 1 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s4, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: global_store_dword v1, v2, s[0:1] ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_add_u32 s0, s0, 4 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x400 ; GFX10-NEXT: s_cbranch_scc0 BB1_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -211,37 +211,37 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s2 -; GFX9-NEXT: s_xor_b32 s3, s3, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: s_xor_b32 s4, s3, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_sub_i32 s3, 0, s4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: BB2_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v2, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v2, s3 +; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v2, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 ; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 -; GFX9-NEXT: s_add_i32 s4, s4, 1 +; GFX9-NEXT: s_add_i32 s3, s3, 1 ; GFX9-NEXT: global_store_dword v1, v2, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s0, 4 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400 +; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400 ; GFX9-NEXT: s_cbranch_scc0 BB2_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -407,34 +407,35 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) { ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s2, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_movk_i32 s3, 0x400 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s2, v5 +; GFX9-NEXT: v_mul_f32_e32 v0, v8, v3 +; GFX9-NEXT: v_trunc_f32_e32 v0, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -445,29 +446,29 @@ ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s1, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s1, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: BB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v2, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v2 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_trunc_f32_e32 v2, v2 -; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, v2, s0 -; GFX10-NEXT: global_store_short v[5:6], v2, off +; GFX10-NEXT: v_trunc_f32_e32 v0, v0 +; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v7|, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v0, s0, 0, v0, s0 +; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz BB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -492,35 +493,35 @@ ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_mov_b32 s6, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_movk_i32 s8, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s5, s4, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s7, s6, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v2, s4, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v0, s6, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 -; GFX9-NEXT: v_mul_f32_e32 v9, v8, v1 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 -; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v8, v8, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v8 -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 +; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -531,31 +532,31 @@ ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s1, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s1, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: BB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v2, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v2 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX10-NEXT: v_mul_f32_e32 v8, v7, v1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX10-NEXT: v_mul_f32_e32 v8, v7, v3 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 ; GFX10-NEXT: v_trunc_f32_e32 v8, v8 -; GFX10-NEXT: v_mad_f32 v7, -v8, v0, v7 +; GFX10-NEXT: v_mad_f32 v7, -v8, v2, v7 ; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v0 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 ; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: global_store_short v[5:6], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 +; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz BB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -580,37 +581,38 @@ ; GFX9-LABEL: sdiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s3, 0x400 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: BB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v9, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s2, v5 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 +; GFX9-NEXT: v_xor_b32_e32 v8, s4, v5 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, s3 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s2, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v1 +; GFX9-NEXT: v_mul_f32_e32 v7, v9, v3 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v8 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v8 ; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v9 +; GFX9-NEXT: v_mad_f32 v7, -v7, v2, v9 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v0| -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, v8, v2 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v2| +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, v8, v0 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -620,34 +622,34 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: BB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_bfe_i32 v5, v4, 0, 16 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v7, v5 ; GFX10-NEXT: v_xor_b32_e32 v8, s4, v5 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v7, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v8, 30, v8 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_trunc_f32_e32 v2, v2 +; GFX10-NEXT: v_trunc_f32_e32 v0, v0 ; GFX10-NEXT: v_or_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_mad_f32 v7, -v2, v0, v7 -; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v0| +; GFX10-NEXT: v_mad_f32 v7, -v0, v2, v7 +; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX10-NEXT: v_cmp_ge_f32_e64 s1, |v7|, |v2| ; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, v8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-NEXT: global_store_short v[5:6], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v7 +; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz BB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -672,39 +674,39 @@ ; GFX9-LABEL: srem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_movk_i32 s7, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: s_sext_i32_i16 s6, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: BB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX9-NEXT: v_xor_b32_e32 v9, s4, v7 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9 -; GFX9-NEXT: v_mul_f32_e32 v9, v10, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX9-NEXT: v_xor_b32_e32 v9, s6, v7 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v9 +; GFX9-NEXT: v_mul_f32_e32 v9, v10, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9 -; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v0| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[2:3] -; GFX9-NEXT: v_add_u32_e32 v2, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX9-NEXT: v_mad_f32 v9, -v9, v2, v10 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v2| +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v0, v11, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, s7 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2 +; GFX9-NEXT: v_sub_u32_e32 v0, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: global_store_short v[5:6], v2, off +; GFX9-NEXT: global_store_short v[5:6], v0, off ; GFX9-NEXT: s_cbranch_vccz BB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -714,36 +716,36 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s1, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_cvt_f32_i32_e32 v2, s1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: BB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_i32_e32 v5, v7 ; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7 -; GFX10-NEXT: v_mul_f32_e32 v8, v5, v1 +; GFX10-NEXT: v_mul_f32_e32 v8, v5, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 30, v6 ; GFX10-NEXT: v_trunc_f32_e32 v8, v8 ; GFX10-NEXT: v_or_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_mad_f32 v5, -v8, v0, v5 +; GFX10-NEXT: v_mad_f32 v5, -v8, v2, v5 ; GFX10-NEXT: v_cvt_i32_f32_e32 v8, v8 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v0| +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v5|, |v2| ; GFX10-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc_lo -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] +; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v8, v9 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v9 ; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_mul_lo_u32 v2, v2, s1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v7, v2 -; GFX10-NEXT: global_store_short v[5:6], v2, off +; GFX10-NEXT: v_mul_lo_u32 v0, v0, s1 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v7, v0 +; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz BB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -210,40 +210,40 @@ ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 -; GCN-NEXT: v_writelane_b32 v40, s38, 3 -; GCN-NEXT: v_writelane_b32 v40, s39, 4 -; GCN-NEXT: v_writelane_b32 v40, s40, 5 -; GCN-NEXT: v_writelane_b32 v40, s41, 6 -; GCN-NEXT: v_writelane_b32 v40, s42, 7 -; GCN-NEXT: v_writelane_b32 v40, s43, 8 -; GCN-NEXT: v_writelane_b32 v40, s44, 9 -; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 ; GCN-NEXT: v_writelane_b32 v40, s46, 11 ; GCN-NEXT: v_writelane_b32 v40, s47, 12 ; GCN-NEXT: v_writelane_b32 v40, s48, 13 ; GCN-NEXT: v_writelane_b32 v40, s49, 14 ; GCN-NEXT: v_writelane_b32 v40, s30, 15 ; GCN-NEXT: v_writelane_b32 v40, s31, 16 -; GCN-NEXT: s_mov_b32 s34, s14 -; GCN-NEXT: s_mov_b32 s35, s13 -; GCN-NEXT: s_mov_b32 s36, s12 -; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] -; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] -; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] -; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: s_mov_b32 s42, s14 +; GCN-NEXT: s_mov_b32 s43, s13 +; GCN-NEXT: s_mov_b32 s44, s12 +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] -; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] -; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] -; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] -; GCN-NEXT: s_mov_b32 s12, s36 -; GCN-NEXT: s_mov_b32 s13, s35 -; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s44 +; GCN-NEXT: s_mov_b32 s13, s43 +; GCN-NEXT: s_mov_b32 s14, s42 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 @@ -257,14 +257,14 @@ ; GCN-NEXT: v_readlane_b32 s48, v40, 13 ; GCN-NEXT: v_readlane_b32 s47, v40, 12 ; GCN-NEXT: v_readlane_b32 s46, v40, 11 -; GCN-NEXT: v_readlane_b32 s45, v40, 10 -; GCN-NEXT: v_readlane_b32 s44, v40, 9 -; GCN-NEXT: v_readlane_b32 s43, v40, 8 -; GCN-NEXT: v_readlane_b32 s42, v40, 7 -; GCN-NEXT: v_readlane_b32 s41, v40, 6 -; GCN-NEXT: v_readlane_b32 s40, v40, 5 -; GCN-NEXT: v_readlane_b32 s39, v40, 4 -; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 @@ -292,27 +292,27 @@ ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 -; GCN-NEXT: v_writelane_b32 v40, s38, 3 -; GCN-NEXT: v_writelane_b32 v40, s39, 4 -; GCN-NEXT: v_writelane_b32 v40, s40, 5 -; GCN-NEXT: v_writelane_b32 v40, s41, 6 -; GCN-NEXT: v_writelane_b32 v40, s42, 7 -; GCN-NEXT: v_writelane_b32 v40, s43, 8 -; GCN-NEXT: v_writelane_b32 v40, s44, 9 -; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 ; GCN-NEXT: v_writelane_b32 v40, s46, 11 ; GCN-NEXT: v_writelane_b32 v40, s47, 12 ; GCN-NEXT: v_writelane_b32 v40, s48, 13 ; GCN-NEXT: v_writelane_b32 v40, s49, 14 ; GCN-NEXT: v_writelane_b32 v40, s30, 15 ; GCN-NEXT: v_writelane_b32 v40, s31, 16 -; GCN-NEXT: s_mov_b32 s34, s14 -; GCN-NEXT: s_mov_b32 s35, s13 -; GCN-NEXT: s_mov_b32 s36, s12 -; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] -; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] -; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] -; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: s_mov_b32 s42, s14 +; GCN-NEXT: s_mov_b32 s43, s13 +; GCN-NEXT: s_mov_b32 s44, s12 +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 @@ -320,13 +320,13 @@ ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b -; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] -; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] -; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] -; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] -; GCN-NEXT: s_mov_b32 s12, s36 -; GCN-NEXT: s_mov_b32 s13, s35 -; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s44 +; GCN-NEXT: s_mov_b32 s13, s43 +; GCN-NEXT: s_mov_b32 s14, s42 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 @@ -340,14 +340,14 @@ ; GCN-NEXT: v_readlane_b32 s48, v40, 13 ; GCN-NEXT: v_readlane_b32 s47, v40, 12 ; GCN-NEXT: v_readlane_b32 s46, v40, 11 -; GCN-NEXT: v_readlane_b32 s45, v40, 10 -; GCN-NEXT: v_readlane_b32 s44, v40, 9 -; GCN-NEXT: v_readlane_b32 s43, v40, 8 -; GCN-NEXT: v_readlane_b32 s42, v40, 7 -; GCN-NEXT: v_readlane_b32 s41, v40, 6 -; GCN-NEXT: v_readlane_b32 s40, v40, 5 -; GCN-NEXT: v_readlane_b32 s39, v40, 4 -; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 @@ -375,40 +375,40 @@ ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 -; GCN-NEXT: v_writelane_b32 v40, s38, 3 -; GCN-NEXT: v_writelane_b32 v40, s39, 4 -; GCN-NEXT: v_writelane_b32 v40, s40, 5 -; GCN-NEXT: v_writelane_b32 v40, s41, 6 -; GCN-NEXT: v_writelane_b32 v40, s42, 7 -; GCN-NEXT: v_writelane_b32 v40, s43, 8 -; GCN-NEXT: v_writelane_b32 v40, s44, 9 -; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 ; GCN-NEXT: v_writelane_b32 v40, s46, 11 ; GCN-NEXT: v_writelane_b32 v40, s47, 12 ; GCN-NEXT: v_writelane_b32 v40, s48, 13 ; GCN-NEXT: v_writelane_b32 v40, s49, 14 ; GCN-NEXT: v_writelane_b32 v40, s30, 15 ; GCN-NEXT: v_writelane_b32 v40, s31, 16 -; GCN-NEXT: s_mov_b32 s34, s14 -; GCN-NEXT: s_mov_b32 s35, s13 -; GCN-NEXT: s_mov_b32 s36, s12 -; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] -; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] -; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] -; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: s_mov_b32 s42, s14 +; GCN-NEXT: s_mov_b32 s43, s13 +; GCN-NEXT: s_mov_b32 s44, s12 +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB4_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s16, v0 ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] -; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] -; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] -; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] -; GCN-NEXT: s_mov_b32 s12, s36 -; GCN-NEXT: s_mov_b32 s13, s35 -; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s44 +; GCN-NEXT: s_mov_b32 s13, s43 +; GCN-NEXT: s_mov_b32 s14, s42 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -424,14 +424,14 @@ ; GCN-NEXT: v_readlane_b32 s48, v40, 13 ; GCN-NEXT: v_readlane_b32 s47, v40, 12 ; GCN-NEXT: v_readlane_b32 s46, v40, 11 -; GCN-NEXT: v_readlane_b32 s45, v40, 10 -; GCN-NEXT: v_readlane_b32 s44, v40, 9 -; GCN-NEXT: v_readlane_b32 s43, v40, 8 -; GCN-NEXT: v_readlane_b32 s42, v40, 7 -; GCN-NEXT: v_readlane_b32 s41, v40, 6 -; GCN-NEXT: v_readlane_b32 s40, v40, 5 -; GCN-NEXT: v_readlane_b32 s39, v40, 4 -; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 @@ -460,27 +460,27 @@ ; GCN-NEXT: v_writelane_b32 v40, s34, 0 ; GCN-NEXT: v_writelane_b32 v40, s35, 1 ; GCN-NEXT: v_writelane_b32 v40, s36, 2 -; GCN-NEXT: v_writelane_b32 v40, s38, 3 -; GCN-NEXT: v_writelane_b32 v40, s39, 4 -; GCN-NEXT: v_writelane_b32 v40, s40, 5 -; GCN-NEXT: v_writelane_b32 v40, s41, 6 -; GCN-NEXT: v_writelane_b32 v40, s42, 7 -; GCN-NEXT: v_writelane_b32 v40, s43, 8 -; GCN-NEXT: v_writelane_b32 v40, s44, 9 -; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s38, 4 +; GCN-NEXT: v_writelane_b32 v40, s39, 5 +; GCN-NEXT: v_writelane_b32 v40, s40, 6 +; GCN-NEXT: v_writelane_b32 v40, s41, 7 +; GCN-NEXT: v_writelane_b32 v40, s42, 8 +; GCN-NEXT: v_writelane_b32 v40, s43, 9 +; GCN-NEXT: v_writelane_b32 v40, s44, 10 ; GCN-NEXT: v_writelane_b32 v40, s46, 11 ; GCN-NEXT: v_writelane_b32 v40, s47, 12 ; GCN-NEXT: v_writelane_b32 v40, s48, 13 ; GCN-NEXT: v_writelane_b32 v40, s49, 14 ; GCN-NEXT: v_writelane_b32 v40, s50, 15 ; GCN-NEXT: v_writelane_b32 v40, s51, 16 -; GCN-NEXT: s_mov_b32 s34, s14 -; GCN-NEXT: s_mov_b32 s35, s13 -; GCN-NEXT: s_mov_b32 s36, s12 -; GCN-NEXT: s_mov_b64 s[38:39], s[10:11] -; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] -; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] -; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] +; GCN-NEXT: s_mov_b32 s42, s14 +; GCN-NEXT: s_mov_b32 s43, s13 +; GCN-NEXT: s_mov_b32 s44, s12 +; GCN-NEXT: s_mov_b64 s[34:35], s[10:11] +; GCN-NEXT: s_mov_b64 s[36:37], s[8:9] +; GCN-NEXT: s_mov_b64 s[38:39], s[6:7] +; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc @@ -494,13 +494,13 @@ ; GCN-NEXT: v_readfirstlane_b32 s17, v1 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc -; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] -; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] -; GCN-NEXT: s_mov_b64 s[8:9], s[40:41] -; GCN-NEXT: s_mov_b64 s[10:11], s[38:39] -; GCN-NEXT: s_mov_b32 s12, s36 -; GCN-NEXT: s_mov_b32 s13, s35 -; GCN-NEXT: s_mov_b32 s14, s34 +; GCN-NEXT: s_mov_b64 s[4:5], s[40:41] +; GCN-NEXT: s_mov_b64 s[6:7], s[38:39] +; GCN-NEXT: s_mov_b64 s[8:9], s[36:37] +; GCN-NEXT: s_mov_b64 s[10:11], s[34:35] +; GCN-NEXT: s_mov_b32 s12, s44 +; GCN-NEXT: s_mov_b32 s13, s43 +; GCN-NEXT: s_mov_b32 s14, s42 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 @@ -518,14 +518,14 @@ ; GCN-NEXT: v_readlane_b32 s48, v40, 13 ; GCN-NEXT: v_readlane_b32 s47, v40, 12 ; GCN-NEXT: v_readlane_b32 s46, v40, 11 -; GCN-NEXT: v_readlane_b32 s45, v40, 10 -; GCN-NEXT: v_readlane_b32 s44, v40, 9 -; GCN-NEXT: v_readlane_b32 s43, v40, 8 -; GCN-NEXT: v_readlane_b32 s42, v40, 7 -; GCN-NEXT: v_readlane_b32 s41, v40, 6 -; GCN-NEXT: v_readlane_b32 s40, v40, 5 -; GCN-NEXT: v_readlane_b32 s39, v40, 4 -; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s44, v40, 10 +; GCN-NEXT: v_readlane_b32 s43, v40, 9 +; GCN-NEXT: v_readlane_b32 s42, v40, 8 +; GCN-NEXT: v_readlane_b32 s41, v40, 7 +; GCN-NEXT: v_readlane_b32 s40, v40, 6 +; GCN-NEXT: v_readlane_b32 s39, v40, 5 +; GCN-NEXT: v_readlane_b32 s38, v40, 4 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 ; GCN-NEXT: v_readlane_b32 s36, v40, 2 ; GCN-NEXT: v_readlane_b32 s35, v40, 1 ; GCN-NEXT: v_readlane_b32 s34, v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1447,53 +1447,53 @@ define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { ; SI-LABEL: insert_split_bb: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s0, s[4:5], 0x4 -; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; SI-NEXT: s_load_dword s6, s[4:5], 0x4 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s0, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_cbranch_scc0 BB30_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s1, s[6:7], 0x1 -; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; SI-NEXT: s_load_dword s7, s[2:3], 0x1 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc ; SI-NEXT: s_cbranch_vccz BB30_3 ; SI-NEXT: s_branch BB30_4 ; SI-NEXT: BB30_2: ; SI-NEXT: BB30_3: ; %if -; SI-NEXT: s_load_dword s1, s[6:7], 0x0 +; SI-NEXT: s_load_dword s7, s[2:3], 0x0 ; SI-NEXT: BB30_4: ; %endif ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: s_mov_b32 s7, 0x100f000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s1 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insert_split_bb: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x10 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; VI-NEXT: s_load_dword s6, s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 +; VI-NEXT: s_cmp_lg_u32 s6, 0 ; VI-NEXT: s_cbranch_scc0 BB30_2 ; VI-NEXT: ; %bb.1: ; %else -; VI-NEXT: s_load_dword s1, s[6:7], 0x4 +; VI-NEXT: s_load_dword s7, s[2:3], 0x4 ; VI-NEXT: s_cbranch_execz BB30_3 ; VI-NEXT: s_branch BB30_4 ; VI-NEXT: BB30_2: ; VI-NEXT: BB30_3: ; %if ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s1, s[6:7], 0x0 +; VI-NEXT: s_load_dword s7, s[2:3], 0x0 ; VI-NEXT: BB30_4: ; %endif ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_mov_b32 s7, 0x1100f000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -891,45 +891,45 @@ ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz BB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 -; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] -; SI-NEXT: s_and_b64 exec, exec, s[6:7] +; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] +; SI-NEXT: s_and_b64 exec, exec, s[4:5] ; SI-NEXT: BB7_3: ; %.continue0.preheader -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_mov_b64 s[2:3], 0 ; SI-NEXT: s_branch BB7_5 ; SI-NEXT: BB7_4: ; %.continue1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[6:7] -; SI-NEXT: s_add_i32 s2, s2, 1 -; SI-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 -; SI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_add_i32 s6, s6, 1 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] ; SI-NEXT: s_cbranch_execz BB7_8 ; SI-NEXT: BB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_mov_b64 s[6:7], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_mov_b64 s[4:5], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 ; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; SI-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] ; SI-NEXT: s_cbranch_execz BB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -941,7 +941,7 @@ ; SI-NEXT: s_and_b64 exec, exec, s[8:9] ; SI-NEXT: s_branch BB7_4 ; SI-NEXT: BB7_8: ; %.return -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_and_b64 exec, exec, s[0:1] ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 @@ -957,45 +957,45 @@ ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_wqm_b64 exec, exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz BB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 -; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX9-NEXT: BB7_3: ; %.continue0.preheader -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_branch BB7_5 ; GFX9-NEXT: BB7_4: ; %.continue1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX9-NEXT: s_add_i32 s2, s2, 1 -; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_add_i32 s6, s6, 1 +; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz BB7_8 ; GFX9-NEXT: BB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] ; GFX9-NEXT: s_cbranch_execz BB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1007,7 +1007,7 @@ ; GFX9-NEXT: s_and_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_branch BB7_4 ; GFX9-NEXT: BB7_8: ; %.return -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1023,10 +1023,10 @@ ; GFX10-32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-32-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-32-NEXT: s_mov_b32 s1, 0 +; GFX10-32-NEXT: s_mov_b32 s2, 0 ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz BB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -1035,30 +1035,30 @@ ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 ; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: s_mov_b32 s2, 0 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_mov_b32 s3, 0 ; GFX10-32-NEXT: s_branch BB7_5 ; GFX10-32-NEXT: BB7_4: ; %.continue1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: s_add_i32 s2, s2, 1 -; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s2, v1 -; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_add_i32 s3, s3, 1 +; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, s3, v1 +; GFX10-32-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_cbranch_execz BB7_8 ; GFX10-32-NEXT: BB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-32-NEXT: s_mov_b32 s3, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 -; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 +; GFX10-32-NEXT: s_mov_b32 s1, s0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s3, 0, s1 +; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 ; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s3, s3, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 +; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo +; GFX10-32-NEXT: s_and_saveexec_b32 s4, s1 +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s4 ; GFX10-32-NEXT: s_cbranch_execz BB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1070,7 +1070,7 @@ ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4 ; GFX10-32-NEXT: s_branch BB7_4 ; GFX10-32-NEXT: BB7_8: ; %.return -; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -1086,42 +1086,42 @@ ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-64-NEXT: s_wqm_b64 exec, exec ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX10-64-NEXT: s_mov_b32 s2, 0 +; GFX10-64-NEXT: s_mov_b32 s6, 0 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz BB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 -; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] -; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] +; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 ; GFX10-64-NEXT: s_branch BB7_5 ; GFX10-64-NEXT: BB7_4: ; %.continue1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 -; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: s_add_i32 s2, s2, 1 -; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s2, v1 -; GFX10-64-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_add_i32 s6, s6, 1 +; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX10-64-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz BB7_8 ; GFX10-64-NEXT: BB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-64-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s2, 0, s[6:7] -; GFX10-64-NEXT: s_xor_b64 s[6:7], s[0:1], -1 +; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5] +; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 ; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec ; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] -; GFX10-64-NEXT: s_xor_b64 s[6:7], exec, s[8:9] +; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9] ; GFX10-64-NEXT: s_cbranch_execz BB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1133,7 +1133,7 @@ ; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9] ; GFX10-64-NEXT: s_branch BB7_4 ; GFX10-64-NEXT: BB7_8: ; %.return -; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -237,111 +237,111 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 ; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_mov_b32 s2, s6 +; SI-NEXT: s_mov_b32 s2, s14 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 ; SI-NEXT: s_brev_b32 s20, 1 -; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s20 +; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] +; SI-NEXT: s_and_b32 s0, s7, s20 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: v_mov_b32_e32 v0, s17 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s19, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s19, 51 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s16 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 +; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 ; SI-NEXT: s_add_i32 s17, s0, s18 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_brev_b32 s16, -2 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] +; SI-NEXT: s_and_b32 s0, s5, s20 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s17, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s17, 51 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] -; SI-NEXT: s_bfe_u32 s0, s15, 0xb0014 -; SI-NEXT: s_add_i32 s10, s0, s18 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] +; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 +; SI-NEXT: s_add_i32 s6, s0, s18 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[0:1] +; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] ; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 -; SI-NEXT: s_and_b32 s0, s15, s20 +; SI-NEXT: s_and_b32 s0, s11, s20 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v5, s0 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s10, 51 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s6, 51 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] -; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; SI-NEXT: s_bfe_u32 s0, s13, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v10, s15 -; SI-NEXT: s_add_i32 s8, s0, s18 +; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] +; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v10, s11 +; SI-NEXT: s_add_i32 s4, s0, s18 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s8 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[0:1] -; SI-NEXT: s_and_b32 s0, s13, s20 +; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] +; SI-NEXT: s_and_b32 s0, s9, s20 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] ; SI-NEXT: v_mov_b32_e32 v5, s0 ; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s8, 51 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, 51 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; SI-NEXT: v_add_f64 v[10:11], s[12:13], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v13, s13 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], -v[4:5] +; SI-NEXT: v_mov_b32_e32 v13, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 ; SI-NEXT: v_bfi_b32 v12, s16, v12, v13 ; SI-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc ; SI-NEXT: v_mov_b32_e32 v10, 0 ; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[10:11] -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[8:9] -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v4f64: @@ -396,166 +396,166 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s7, 0xfc01 -; SI-NEXT: s_mov_b32 s5, 0xfffff -; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 +; SI-NEXT: s_mov_b32 s22, -1 +; SI-NEXT: s_movk_i32 s23, 0xfc01 +; SI-NEXT: s_mov_b32 s21, 0xfffff +; SI-NEXT: s_mov_b32 s20, s22 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 -; SI-NEXT: s_add_i32 s26, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s26 +; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 +; SI-NEXT: s_add_i32 s26, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26 ; SI-NEXT: s_brev_b32 s27, 1 -; SI-NEXT: s_andn2_b64 s[24:25], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, s27 +; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] +; SI-NEXT: s_and_b32 s2, s7, s27 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_mov_b32_e32 v0, s25 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s26, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s26, 51 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s24 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] -; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_add_i32 s25, s2, s7 +; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; SI-NEXT: s_add_i32 s25, s2, s23 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_brev_b32 s24, -2 ; SI-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; SI-NEXT: v_mov_b32_e32 v4, s11 +; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_bfi_b32 v4, s24, v18, v4 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s25 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s25 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_andn2_b64 s[10:11], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, s27 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] +; SI-NEXT: s_and_b32 s2, s5, s27 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_cmp_lt_i32_e64 vcc, s25, 0 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s25, 51 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[2:3] -; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] -; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] -; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_add_i32 s10, s2, s7 +; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] +; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: s_add_i32 s6, s2, s23 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: v_bfi_b32 v6, s24, v18, v6 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s10 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 -; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, s27 +; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] +; SI-NEXT: s_and_b32 s2, s11, s27 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] ; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 +; SI-NEXT: v_mov_b32_e32 v5, s11 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] -; SI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v8, s15 -; SI-NEXT: s_add_i32 s10, s2, s7 +; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] +; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: s_add_i32 s6, s2, s23 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: v_bfi_b32 v8, s24, v18, v8 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s10 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v8, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 -; SI-NEXT: s_andn2_b64 s[8:9], s[12:13], s[2:3] -; SI-NEXT: s_and_b32 s2, s13, s27 +; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] +; SI-NEXT: s_and_b32 s2, s9, s27 ; SI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] ; SI-NEXT: v_mov_b32_e32 v5, s2 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; SI-NEXT: v_mov_b32_e32 v5, s13 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 +; SI-NEXT: v_mov_b32_e32 v5, s9 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[2:3] -; SI-NEXT: v_mov_b32_e32 v4, s8 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; SI-NEXT: v_mov_b32_e32 v8, s12 +; SI-NEXT: v_mov_b32_e32 v8, s8 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[2:3] -; SI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5] -; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: s_add_i32 s10, s2, s7 +; SI-NEXT: v_add_f64 v[8:9], s[8:9], -v[4:5] +; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 +; SI-NEXT: v_mov_b32_e32 v10, s9 +; SI-NEXT: s_add_i32 s6, s2, s23 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 ; SI-NEXT: v_bfi_b32 v10, s24, v18, v10 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s10 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc ; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_andn2_b64 s[8:9], s[18:19], s[2:3] -; SI-NEXT: s_and_b32 s2, s19, s27 +; SI-NEXT: s_andn2_b64 s[4:5], s[14:15], s[2:3] +; SI-NEXT: s_and_b32 s2, s15, s27 ; SI-NEXT: v_add_f64 v[4:5], v[4:5], v[8:9] ; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_mov_b32_e32 v8, s9 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 +; SI-NEXT: v_mov_b32_e32 v8, s5 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s6, 51 ; SI-NEXT: v_cndmask_b32_e64 v13, v8, v9, s[2:3] -; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v8, s4 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v9, s18 +; SI-NEXT: v_mov_b32_e32 v9, s14 ; SI-NEXT: v_cndmask_b32_e64 v12, v8, v9, s[2:3] -; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 -; SI-NEXT: s_add_i32 s12, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s12 -; SI-NEXT: s_andn2_b64 s[8:9], s[16:17], s[2:3] -; SI-NEXT: s_bfe_u32 s2, s23, 0xb0014 -; SI-NEXT: s_add_i32 s14, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s14 -; SI-NEXT: v_mov_b32_e32 v8, s19 -; SI-NEXT: s_andn2_b64 s[10:11], s[22:23], s[2:3] -; SI-NEXT: s_and_b32 s2, s23, s27 +; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014 +; SI-NEXT: s_add_i32 s8, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s8 +; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[2:3] +; SI-NEXT: s_bfe_u32 s2, s19, 0xb0014 +; SI-NEXT: s_add_i32 s10, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: s_andn2_b64 s[6:7], s[18:19], s[2:3] +; SI-NEXT: s_and_b32 s2, s19, s27 ; SI-NEXT: v_bfi_b32 v19, s24, v18, v8 ; SI-NEXT: v_mov_b32_e32 v9, s2 -; SI-NEXT: v_mov_b32_e32 v8, s11 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s14, 0 +; SI-NEXT: v_mov_b32_e32 v8, s7 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 ; SI-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; SI-NEXT: v_mov_b32_e32 v9, s23 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s14, 51 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 ; SI-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[2:3] -; SI-NEXT: v_mov_b32_e32 v8, s10 +; SI-NEXT: v_mov_b32_e32 v8, s6 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v10, s18 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[2:3] -; SI-NEXT: s_bfe_u32 s2, s21, 0xb0014 -; SI-NEXT: s_add_i32 s7, s2, s7 -; SI-NEXT: s_lshr_b64 s[2:3], s[4:5], s7 -; SI-NEXT: s_andn2_b64 s[4:5], s[20:21], s[2:3] -; SI-NEXT: s_and_b32 s2, s21, s27 +; SI-NEXT: s_bfe_u32 s2, s17, 0xb0014 +; SI-NEXT: s_add_i32 s10, s2, s23 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s10 +; SI-NEXT: s_andn2_b64 s[6:7], s[16:17], s[2:3] +; SI-NEXT: s_and_b32 s2, s17, s27 ; SI-NEXT: v_mov_b32_e32 v11, s2 -; SI-NEXT: v_mov_b32_e32 v10, s5 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s7, 0 +; SI-NEXT: v_mov_b32_e32 v10, s7 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s10, 0 ; SI-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s7, 51 -; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s10, 51 ; SI-NEXT: v_cndmask_b32_e64 v15, v10, v11, s[2:3] -; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_mov_b32_e32 v10, s6 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: v_mov_b32_e32 v11, s16 ; SI-NEXT: v_cndmask_b32_e64 v14, v10, v11, s[2:3] -; SI-NEXT: v_add_f64 v[10:11], s[20:21], -v[14:15] -; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15] +; SI-NEXT: v_mov_b32_e32 v17, s19 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[10:11]|, 0.5 -; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[8:9] -; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9] +; SI-NEXT: v_mov_b32_e32 v16, s17 ; SI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5 ; SI-NEXT: v_bfi_b32 v17, s24, v18, v17 ; SI-NEXT: v_cndmask_b32_e64 v11, 0, v17, s[2:3] @@ -564,26 +564,26 @@ ; SI-NEXT: v_add_f64 v[10:11], v[8:9], v[10:11] ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v16, vcc ; SI-NEXT: v_mov_b32_e32 v8, 0 -; SI-NEXT: s_and_b32 s13, s17, s27 +; SI-NEXT: s_and_b32 s9, s13, s27 ; SI-NEXT: v_add_f64 v[8:9], v[14:15], v[8:9] -; SI-NEXT: v_mov_b32_e32 v14, s9 -; SI-NEXT: v_mov_b32_e32 v15, s13 -; SI-NEXT: v_cmp_lt_i32_e64 vcc, s12, 0 +; SI-NEXT: v_mov_b32_e32 v14, s5 +; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_cmp_lt_i32_e64 vcc, s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s12, 51 +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_cmp_gt_i32_e64 s[2:3], s8, 51 ; SI-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v14, s8 +; SI-NEXT: v_mov_b32_e32 v14, s4 ; SI-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc -; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v15, s12 ; SI-NEXT: v_cndmask_b32_e64 v16, v14, v15, s[2:3] -; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_mov_b32_e32 v14, s13 ; SI-NEXT: v_bfi_b32 v18, s24, v18, v14 -; SI-NEXT: v_add_f64 v[14:15], s[16:17], -v[16:17] -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_add_f64 v[14:15], s[12:13], -v[16:17] +; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; SI-NEXT: v_add_f64 v[14:15], s[18:19], -v[12:13] -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_add_f64 v[14:15], s[14:15], -v[12:13] +; SI-NEXT: s_mov_b32 s23, 0xf000 ; SI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v14, 0 ; SI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] @@ -592,78 +592,78 @@ ; SI-NEXT: v_mov_b32_e32 v12, 0 ; SI-NEXT: v_add_f64 v[12:13], v[16:17], v[12:13] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[20:23], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[20:23], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x19 +; CI-NEXT: s_brev_b32 s18, -2 ; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s19, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v16, v4 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s18, v16, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[0:1] ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v6, s9 +; CI-NEXT: v_add_f64 v[0:1], s[0:1], -v[4:5] +; CI-NEXT: v_mov_b32_e32 v6, s1 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v6, s2, v16, v6 +; CI-NEXT: v_bfi_b32 v6, s18, v16, v6 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[6:7] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] -; CI-NEXT: v_add_f64 v[4:5], s[14:15], -v[6:7] -; CI-NEXT: v_mov_b32_e32 v8, s15 +; CI-NEXT: v_add_f64 v[4:5], s[6:7], -v[6:7] +; CI-NEXT: v_mov_b32_e32 v8, s7 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v8, s2, v16, v8 +; CI-NEXT: v_bfi_b32 v8, s18, v16, v8 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v10, s13 +; CI-NEXT: v_add_f64 v[4:5], s[4:5], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v10, s5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v10, s2, v16, v10 +; CI-NEXT: v_bfi_b32 v10, s18, v16, v10 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v10, vcc ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_add_f64 v[4:5], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v8, s19 -; CI-NEXT: v_bfi_b32 v18, s2, v16, v8 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[20:21] -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[22:23] -; CI-NEXT: v_add_f64 v[14:15], s[20:21], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v19, s23 +; CI-NEXT: v_mov_b32_e32 v8, s11 +; CI-NEXT: v_bfi_b32 v18, s18, v16, v8 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[12:13] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[14:15] +; CI-NEXT: v_add_f64 v[14:15], s[12:13], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v19, s15 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[22:23], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v17, s21 +; CI-NEXT: v_add_f64 v[14:15], s[14:15], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v17, s13 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 -; CI-NEXT: v_bfi_b32 v19, s2, v16, v19 -; CI-NEXT: v_trunc_f64_e32 v[12:13], s[16:17] -; CI-NEXT: v_bfi_b32 v17, s2, v16, v17 +; CI-NEXT: v_bfi_b32 v19, s18, v16, v19 +; CI-NEXT: v_trunc_f64_e32 v[12:13], s[8:9] +; CI-NEXT: v_bfi_b32 v17, s18, v16, v17 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v19, s[0:1] ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_add_f64 v[10:11], v[10:11], v[14:15] ; CI-NEXT: v_cndmask_b32_e32 v15, 0, v17, vcc ; CI-NEXT: v_mov_b32_e32 v14, 0 -; CI-NEXT: v_mov_b32_e32 v17, s17 +; CI-NEXT: v_mov_b32_e32 v17, s9 +; CI-NEXT: v_bfi_b32 v19, s18, v16, v17 ; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[14:15] -; CI-NEXT: v_add_f64 v[14:15], s[16:17], -v[12:13] -; CI-NEXT: v_bfi_b32 v19, s2, v16, v17 -; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] +; CI-NEXT: v_add_f64 v[14:15], s[8:9], -v[12:13] +; CI-NEXT: v_trunc_f64_e32 v[16:17], s[10:11] ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 -; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: v_add_f64 v[14:15], s[10:11], -v[16:17] +; CI-NEXT: s_mov_b32 s18, -1 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] @@ -671,10 +671,10 @@ ; CI-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc ; CI-NEXT: v_mov_b32_e32 v16, 0 ; CI-NEXT: v_add_f64 v[12:13], v[12:13], v[16:17] -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[16:19], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -2893,28 +2893,32 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, 0xffff -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x10 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s17, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s16, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s19, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s53, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s1, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s0, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s3, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s55, s2, s20 ; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s20 ; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s20 ; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s20 @@ -2927,25 +2931,21 @@ ; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, s20 ; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s20 ; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s37, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s55, s36, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s56, s39, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s57, s38, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s58, s41, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s59, s40, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s60, s43, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s61, s42, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s62, s45, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s63, s44, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s64, s47, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s65, s46, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s66, s49, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s67, s48, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s68, s51, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s56, s37, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s57, s36, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s58, s39, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s59, s38, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s60, s41, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s61, s40, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s62, s43, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s63, s42, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s64, s45, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s65, s44, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s66, s47, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s67, s46, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s68, s49, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s69, s48, s20 +; GCN-NOHSA-SI-NEXT: s_and_b32 s70, s51, s20 ; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s50, s20 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s37, s37, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s36, 16 @@ -2963,31 +2963,33 @@ ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s51, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s50, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s43, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s69 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s68 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s67 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s66 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s65 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s63 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s59 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s61 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s60 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s41 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 @@ -2996,220 +2998,220 @@ ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s59 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s57 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s56 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s52 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s34 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s55 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s54 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_mov_b32 s37, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s21, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s23, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s25, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s27, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s30, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s31, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s33, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s35, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s18, 16 -; GCN-HSA-NEXT: s_and_b32 s38, s5, s37 -; GCN-HSA-NEXT: s_and_b32 s39, s4, s37 -; GCN-HSA-NEXT: s_and_b32 s40, s7, s37 -; GCN-HSA-NEXT: s_and_b32 s41, s6, s37 -; GCN-HSA-NEXT: s_and_b32 s42, s9, s37 -; GCN-HSA-NEXT: s_and_b32 s43, s8, s37 -; GCN-HSA-NEXT: s_and_b32 s44, s11, s37 -; GCN-HSA-NEXT: s_and_b32 s45, s10, s37 -; GCN-HSA-NEXT: s_and_b32 s46, s13, s37 -; GCN-HSA-NEXT: s_and_b32 s47, s12, s37 -; GCN-HSA-NEXT: s_and_b32 s48, s15, s37 -; GCN-HSA-NEXT: s_and_b32 s49, s14, s37 -; GCN-HSA-NEXT: s_and_b32 s50, s17, s37 -; GCN-HSA-NEXT: s_and_b32 s51, s16, s37 -; GCN-HSA-NEXT: s_and_b32 s52, s19, s37 -; GCN-HSA-NEXT: s_and_b32 s53, s18, s37 -; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x10 +; GCN-HSA-NEXT: s_lshr_b32 s20, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-HSA-NEXT: s_and_b32 s38, s1, s37 +; GCN-HSA-NEXT: s_and_b32 s39, s0, s37 +; GCN-HSA-NEXT: s_and_b32 s40, s3, s37 +; GCN-HSA-NEXT: s_and_b32 s41, s2, s37 +; GCN-HSA-NEXT: s_and_b32 s42, s5, s37 +; GCN-HSA-NEXT: s_and_b32 s43, s4, s37 +; GCN-HSA-NEXT: s_and_b32 s44, s7, s37 +; GCN-HSA-NEXT: s_and_b32 s45, s6, s37 +; GCN-HSA-NEXT: s_and_b32 s46, s9, s37 +; GCN-HSA-NEXT: s_and_b32 s47, s8, s37 +; GCN-HSA-NEXT: s_and_b32 s48, s11, s37 +; GCN-HSA-NEXT: s_and_b32 s49, s10, s37 +; GCN-HSA-NEXT: s_and_b32 s50, s13, s37 +; GCN-HSA-NEXT: s_and_b32 s51, s12, s37 +; GCN-HSA-NEXT: s_and_b32 s52, s15, s37 +; GCN-HSA-NEXT: s_and_b32 s53, s14, s37 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s59, s8, s37 -; GCN-HSA-NEXT: s_and_b32 s60, s11, s37 -; GCN-HSA-NEXT: s_and_b32 s61, s10, s37 -; GCN-HSA-NEXT: s_and_b32 s62, s13, s37 -; GCN-HSA-NEXT: s_and_b32 s63, s12, s37 -; GCN-HSA-NEXT: s_and_b32 s64, s15, s37 -; GCN-HSA-NEXT: s_and_b32 s65, s14, s37 -; GCN-HSA-NEXT: s_and_b32 s66, s17, s37 -; GCN-HSA-NEXT: s_and_b32 s67, s16, s37 -; GCN-HSA-NEXT: s_and_b32 s68, s19, s37 -; GCN-HSA-NEXT: s_and_b32 s54, s5, s37 -; GCN-HSA-NEXT: s_and_b32 s55, s4, s37 -; GCN-HSA-NEXT: s_and_b32 s56, s7, s37 -; GCN-HSA-NEXT: s_and_b32 s57, s6, s37 -; GCN-HSA-NEXT: s_and_b32 s58, s9, s37 -; GCN-HSA-NEXT: s_and_b32 s37, s18, s37 +; GCN-HSA-NEXT: s_and_b32 s57, s4, s37 +; GCN-HSA-NEXT: s_and_b32 s58, s7, s37 +; GCN-HSA-NEXT: s_and_b32 s59, s6, s37 +; GCN-HSA-NEXT: s_and_b32 s60, s9, s37 +; GCN-HSA-NEXT: s_and_b32 s61, s8, s37 +; GCN-HSA-NEXT: s_and_b32 s62, s11, s37 +; GCN-HSA-NEXT: s_and_b32 s63, s10, s37 +; GCN-HSA-NEXT: s_and_b32 s64, s13, s37 +; GCN-HSA-NEXT: s_and_b32 s65, s12, s37 +; GCN-HSA-NEXT: s_and_b32 s66, s15, s37 +; GCN-HSA-NEXT: s_and_b32 s54, s3, s37 +; GCN-HSA-NEXT: s_and_b32 s55, s2, s37 +; GCN-HSA-NEXT: s_and_b32 s56, s5, s37 +; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 ; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-HSA-NEXT: s_and_b32 s18, s1, s37 +; GCN-HSA-NEXT: s_and_b32 s19, s0, s37 +; GCN-HSA-NEXT: s_and_b32 s37, s14, s37 ; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-HSA-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s67, s1, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 ; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s4 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s68 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s52 @@ -3222,224 +3224,226 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s45 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s41 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x40 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s19, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s51, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s53, s50, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s7, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s62, s13, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s63, s12, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s15, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s14, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s17, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s16, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s19, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s37, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s37, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s36, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s36, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s39, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s39, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s38, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s38, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s41, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s41, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s40, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s40, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s43, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s43, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s42, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s37, s42, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s45, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s39, s45, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s44, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s41, s44, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s47, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s43, s47, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s46, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s45, s46, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s49, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s47, s49, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s48, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s48, s48, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s51, s51, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s37, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s37, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s36, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s36, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s39, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s39, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s38, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s38, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s41, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s41, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s40, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s40, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s43, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s43, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s42, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s42, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s45, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s37, s45, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s44, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s39, s44, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s47, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s41, s47, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s46, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s43, s46, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s49, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s45, s49, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s48, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s47, s48, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s51, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s49, s51, s20 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s50, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s50, s50, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s20 +; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s20 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s20 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s18, s20 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s17, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s14, s20 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s0, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s2, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s65 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s63 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s69 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s67 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s62 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s60 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s59 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s56 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s55 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s52 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s44 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s45 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s40 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -4015,163 +4019,166 @@ ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x40 -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x40 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s19, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s18, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s19, s19 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s18 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s17, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s66, s16, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s17, s17 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s15, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s64, s14, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s15, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s14, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s61, s13, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s62, s12, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s68, s12, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s37, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s37 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s22, s39, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s39 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s26, s41, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s41 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s30, s43, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s33, s43 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s45, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s37, s45 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s47, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s41, s47 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s43, s49, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s45, s49 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s47, s51, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s49, s51 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s53, s1 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s52, s0, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s54, s0 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s55, s3, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s57, s3 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s56, s2, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s58, s2 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s59, s11, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s60, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s11, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s66, s10, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s57, s9, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s58, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s64, s8, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s55, s7, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s56, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s61, s7, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s62, s6, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s53, s5, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s54, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s59, s5, 16 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s60, s4, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s19, s36, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s21, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s52, s50, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s20, s37, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s37 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s24, s39, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s39 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s28, s41, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s41 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s43, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s35, s43 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s37, s45, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s39, s45 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s41, s47, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s43, s47 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s45, s49, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s47, s49 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s49, s51, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s51, s51 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s23, s38, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s25, s38 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s27, s40, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s29, s40 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s42, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s34, s42 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s36, s44, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s38, s44 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s40, s46, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s42, s46 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s44, s48, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s46, s48 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s48, s50, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s50, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s21, s36, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s23, s36 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s25, s38, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s27, s38 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s40, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s31, s40 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s34, s42, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s36, s42 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s44, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s40, s44 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s42, s46, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s44, s46 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s46, s48, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s48, s48 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s52 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s49 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -6644,101 +6651,103 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, s23 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s21 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s19 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s17 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s15 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s9 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s22, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s20, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s18, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s16, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s8, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[16:17], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s7 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s28, s5 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s3 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s8, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[2:3], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[56:57], s[4:5], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[58:59], s[6:7], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[6:7], s[6:7], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[8:9], s[8:9], 48 ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[10:11], s[10:11], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[14:15], s[14:15], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[16:17], s[16:17], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[18:19], s[18:19], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[20:21], s[20:21], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[22:23], s[22:23], 48 -; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s21 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[2:3], s[12:13], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[12:13], s[14:15], 48 +; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s13 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[28:29], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[52:53], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[50:51], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[48:49], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[46:47], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[52:53], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[50:51], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[48:49], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[36:37], 0x100000 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s71 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s66 @@ -6755,382 +6764,390 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s57 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s54 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x0 +; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s8, s51 -; GCN-HSA-NEXT: s_mov_b32 s34, s49 -; GCN-HSA-NEXT: s_mov_b32 s52, s47 -; GCN-HSA-NEXT: s_mov_b32 s54, s45 -; GCN-HSA-NEXT: s_mov_b32 s56, s43 -; GCN-HSA-NEXT: s_mov_b32 s58, s41 -; GCN-HSA-NEXT: s_mov_b32 s60, s39 -; GCN-HSA-NEXT: s_mov_b32 s62, s37 -; GCN-HSA-NEXT: s_lshr_b32 s30, s46, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s42, 16 -; GCN-HSA-NEXT: s_lshr_b32 s16, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s38, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_lshr_b32 s64, s50, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 16 -; GCN-HSA-NEXT: s_lshr_b32 s68, s36, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[36:37], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[28:29], s[36:37], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[38:39], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[38:39], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[40:41], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[38:39], s[40:41], 48 -; GCN-HSA-NEXT: s_ashr_i64 s[40:41], s[42:43], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[42:43], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[42:43], s[44:45], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[44:45], s[46:47], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[46:47], s[48:49], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x100000 -; GCN-HSA-NEXT: s_ashr_i64 s[48:49], s[50:51], 48 -; GCN-HSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s38, s15 +; GCN-HSA-NEXT: s_mov_b32 s40, s13 +; GCN-HSA-NEXT: s_mov_b32 s42, s11 +; GCN-HSA-NEXT: s_mov_b32 s44, s9 +; GCN-HSA-NEXT: s_mov_b32 s46, s7 +; GCN-HSA-NEXT: s_mov_b32 s48, s5 +; GCN-HSA-NEXT: s_mov_b32 s50, s3 +; GCN-HSA-NEXT: s_mov_b32 s52, s1 +; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[0:1], s[14:15], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 +; GCN-HSA-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[12:13], s[12:13], 48 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GCN-HSA-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[68:69], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[60:61], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[58:59], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[54:55], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[50:51], 0x100000 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s49 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[68:69], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[66:67], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[64:65], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s64, s0, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s65, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s35 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s35 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s35 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s35 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x90 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s41 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s35 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 16 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s55 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s76 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s70 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s71 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s64 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 -; GCN-HSA-NEXT: s_add_u32 s26, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s43 -; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s41 -; GCN-HSA-NEXT: s_add_u32 s22, s0, 0xc0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s34 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s26 -; GCN-HSA-NEXT: s_addc_u32 s23, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: s_add_u32 s18, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_sextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, -1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s27 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s27, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s26, 16 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s60, s25 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s25, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[24:25], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s24, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s0, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s1, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s2, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s34, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s5, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s7, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s9, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s58, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s11, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s64, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s13, 16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s70, s15 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s72, s15, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[14:15], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[24:25], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[28:29], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[34:35], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[42:43], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[48:49], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[54:55], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[60:61], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[66:67], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[66:67], s[72:73], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[36:37], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[46:47], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[52:53], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[58:59], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[64:65], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[64:65], s[70:71], 0x100000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s64 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s23 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s23, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s62 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[22:23], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s22, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[62:63], s[12:13], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s21 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s21, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s57 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[20:21], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s20, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[10:11], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s19 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s19, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s50 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[18:19], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s18, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[8:9], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s17 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s17, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s16, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x100000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s15, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s12, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s13, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s13 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -161,68 +161,68 @@ ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[12:13], s[14:15] ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 -; GCN-IR-NEXT: s_add_i32 s14, s14, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s15, s11 -; GCN-IR-NEXT: s_min_u32 s12, s12, s13 -; GCN-IR-NEXT: s_min_u32 s16, s14, s15 -; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[14:15], 63 -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_min_u32 s14, s12, s13 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s10 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s11 +; GCN-IR-NEXT: s_min_u32 s16, s12, s13 +; GCN-IR-NEXT: s_sub_u32 s12, s14, s16 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[12:13], 63 +; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[20:21], s[14:15], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[20:21], s[12:13], 63 ; GCN-IR-NEXT: s_xor_b64 s[22:23], s[18:19], -1 ; GCN-IR-NEXT: s_and_b64 s[20:21], s[22:23], s[20:21] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s14, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s14 -; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s15 +; GCN-IR-NEXT: s_add_u32 s18, s12, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_addc_u32 s19, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[10:11], s14 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[10:11], s12 ; GCN-IR-NEXT: s_cbranch_vccz BB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GCN-IR-NEXT: s_add_u32 s10, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 -; GCN-IR-NEXT: s_add_u32 s12, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s13 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 +; GCN-IR-NEXT: s_add_u32 s20, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] +; GCN-IR-NEXT: s_add_u32 s10, s8, s16 +; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 +; GCN-IR-NEXT: s_mov_b32 s17, s15 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s8, s15, 31 -; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] -; GCN-IR-NEXT: s_sub_u32 s8, s10, s18 -; GCN-IR-NEXT: s_subb_u32 s8, s11, s19 -; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s8, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[20:21], s[16:17], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s18, s18, s20 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: s_subb_u32 s19, s19, s21 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 -; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] +; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[18:19], 1 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s8, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s18, s16, s18 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_subb_u32 s19, s17, s19 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB0_3 ; GCN-IR-NEXT: BB0_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 @@ -396,73 +396,73 @@ ; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 -; GCN-IR-NEXT: v_min_u32_e32 v13, v0, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v9 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v10 -; GCN-IR-NEXT: v_min_u32_e32 v14, v0, v7 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v13, v14 +; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v10 +; GCN-IR-NEXT: v_min_u32_e32 v13, v7, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v15, v18 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v9, 0, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v14, v17 +; GCN-IR-NEXT: v_cndmask_b32_e64 v11, v9, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v7 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8] +; GCN-IR-NEXT: v_add_i32_e32 v15, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v16, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[15:16], v[7:8] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v0 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v7 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[9:10], v16 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v10, v13 -; GCN-IR-NEXT: v_not_b32_e32 v11, v18 -; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, v10, v14 +; GCN-IR-NEXT: v_add_i32_e32 v19, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_not_b32_e32 v0, v0 +; GCN-IR-NEXT: v_lshr_b64 v[15:16], v[9:10], v15 +; GCN-IR-NEXT: v_not_b32_e32 v10, v17 +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, v0, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v19, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, v11, v15, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v10, v16, v10 +; GCN-IR-NEXT: v_lshl_b64 v[13:14], v[15:16], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v8 +; GCN-IR-NEXT: v_or_b32_e32 v0, v13, v0 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v9, v17, vcc -; GCN-IR-NEXT: v_or_b32_e32 v7, v18, v7 -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, 1, v13 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v15, 31, v11 -; GCN-IR-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, 0, v14, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[18:19], v[13:14] -; GCN-IR-NEXT: v_mov_b32_e32 v13, v18 +; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v19, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, v20, v14, vcc +; GCN-IR-NEXT: v_or_b32_e32 v7, v17, v7 +; GCN-IR-NEXT: v_add_i32_e32 v17, vcc, 1, v9 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v11 +; GCN-IR-NEXT: v_or_b32_e32 v8, v18, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[17:18], v[9:10] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v17 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v15 -; GCN-IR-NEXT: v_and_b32_e32 v20, v15, v3 -; GCN-IR-NEXT: v_and_b32_e32 v15, v15, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v14, v19 -; GCN-IR-NEXT: v_mov_b32_e32 v19, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5] +; GCN-IR-NEXT: v_and_b32_e32 v11, 1, v13 +; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3 +; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2 +; GCN-IR-NEXT: v_sub_i32_e64 v15, s[4:5], v0, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v18 +; GCN-IR-NEXT: v_mov_b32_e32 v18, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v16, s[4:5], v14, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v18, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v17, v11 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -471,15 +471,15 @@ ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1 ; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v3 -; GCN-IR-NEXT: v_or_b32_e32 v0, v11, v2 +; GCN-IR-NEXT: v_or_b32_e32 v11, v11, v2 ; GCN-IR-NEXT: BB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v2, v5, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v3, v11, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6 -; GCN-IR-NEXT: v_xor_b32_e32 v3, v12, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v2, v12, v1 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %x, %y ret i64 %result @@ -1022,68 +1022,68 @@ ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[12:13], s[14:15] ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s14, s10 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s13, s7 -; GCN-IR-NEXT: s_add_i32 s14, s14, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s15, s11 -; GCN-IR-NEXT: s_min_u32 s12, s12, s13 -; GCN-IR-NEXT: s_min_u32 s16, s14, s15 -; GCN-IR-NEXT: s_sub_u32 s14, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[14:15], 63 -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_min_u32 s14, s12, s13 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s10 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s11 +; GCN-IR-NEXT: s_min_u32 s16, s12, s13 +; GCN-IR-NEXT: s_sub_u32 s12, s14, s16 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[20:21], s[12:13], 63 +; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[20:21] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[20:21], s[14:15], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[20:21], s[12:13], 63 ; GCN-IR-NEXT: s_xor_b64 s[22:23], s[18:19], -1 ; GCN-IR-NEXT: s_and_b64 s[20:21], s[22:23], s[20:21] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s14, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s14 -; GCN-IR-NEXT: s_addc_u32 s19, s15, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s15 +; GCN-IR-NEXT: s_add_u32 s18, s12, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_addc_u32 s19, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[10:11], s14 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[10:11], s12 ; GCN-IR-NEXT: s_cbranch_vccz BB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GCN-IR-NEXT: s_add_u32 s10, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 -; GCN-IR-NEXT: s_add_u32 s12, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s13 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 +; GCN-IR-NEXT: s_add_u32 s20, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] +; GCN-IR-NEXT: s_add_u32 s10, s8, s16 +; GCN-IR-NEXT: s_addc_u32 s11, s9, s15 +; GCN-IR-NEXT: s_mov_b32 s17, s15 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s8, s15, 31 -; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] -; GCN-IR-NEXT: s_sub_u32 s8, s10, s18 -; GCN-IR-NEXT: s_subb_u32 s8, s11, s19 -; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s8, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[20:21], s[16:17], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s18, s18, s20 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: s_subb_u32 s19, s19, s21 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 -; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] +; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[18:19], 1 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s8, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s18, s16, s18 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_subb_u32 s19, s17, s19 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB9_3 ; GCN-IR-NEXT: BB9_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 @@ -1242,61 +1242,61 @@ ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s6, s7 -; GCN-IR-NEXT: s_add_u32 s8, s10, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 +; GCN-IR-NEXT: s_min_u32 s8, s6, s7 +; GCN-IR-NEXT: s_add_u32 s10, s8, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[10:11], 63 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[14:15], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[14:15], s[10:11], 63 ; GCN-IR-NEXT: s_xor_b64 s[16:17], s[12:13], -1 ; GCN-IR-NEXT: s_and_b64 s[14:15], s[16:17], s[14:15] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_addc_u32 s15, s9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_add_u32 s12, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[12:13], 24, s8 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], 24, s9 ; GCN-IR-NEXT: s_cbranch_vccz BB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], 24, s14 -; GCN-IR-NEXT: s_add_u32 s8, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s9, s3, -1 -; GCN-IR-NEXT: s_sub_u32 s10, 58, s10 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], 24, s12 +; GCN-IR-NEXT: s_add_u32 s16, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 +; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 +; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s6, s13, 31 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s6, s8, s16 -; GCN-IR-NEXT: s_subb_u32 s6, s9, s17 -; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s6, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s16, s16, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_subb_u32 s17, s17, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 -; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] +; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s6, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s6, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: s_add_u32 s8, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB10_3 ; GCN-IR-NEXT: BB10_4: ; %Flow5 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 @@ -1442,26 +1442,26 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 +; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v10 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5] +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[10:11], v[4:5] ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 @@ -1471,38 +1471,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, 58, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v13, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v14 -; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v14 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v1 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 +; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1678,39 +1678,39 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], s[4:5], v10 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v10, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v11, v13, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 -; GCN-IR-NEXT: v_and_b32_e32 v17, v14, v0 -; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 -; GCN-IR-NEXT: v_and_b32_e32 v16, v14, v1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v6 -; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v7, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v14 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8 +; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v0 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v1 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v9 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v8 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -18,15 +18,15 @@ ; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_cbranch_scc0 BB0_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s0, s11, s0 +; SI-NEXT: s_add_i32 s2, s11, s0 ; SI-NEXT: s_cbranch_execz BB0_3 ; SI-NEXT: s_branch BB0_4 ; SI-NEXT: BB0_2: -; SI-NEXT: ; implicit-def: $sgpr0 +; SI-NEXT: ; implicit-def: $sgpr2 ; SI-NEXT: BB0_3: ; %if -; SI-NEXT: s_sub_i32 s0, s9, s10 +; SI-NEXT: s_sub_i32 s2, s9, s10 ; SI-NEXT: BB0_4: ; %endif -; SI-NEXT: s_add_i32 s0, s0, s8 +; SI-NEXT: s_add_i32 s0, s2, s8 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -55,27 +55,27 @@ define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0x13 +; SI-NEXT: s_load_dword s6, s[0:1], 0x13 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cmp_lg_u32 s6, 0 ; SI-NEXT: s_cbranch_scc0 BB1_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s3, s[0:1], 0x2e -; SI-NEXT: s_load_dword s6, s[0:1], 0x37 +; SI-NEXT: s_load_dword s2, s[0:1], 0x2e +; SI-NEXT: s_load_dword s3, s[0:1], 0x37 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s3, s3, s6 +; SI-NEXT: s_add_i32 s7, s2, s3 ; SI-NEXT: s_cbranch_execz BB1_3 ; SI-NEXT: s_branch BB1_4 ; SI-NEXT: BB1_2: -; SI-NEXT: ; implicit-def: $sgpr3 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: BB1_3: ; %if -; SI-NEXT: s_load_dword s3, s[0:1], 0x1c +; SI-NEXT: s_load_dword s2, s[0:1], 0x1c ; SI-NEXT: s_load_dword s0, s[0:1], 0x25 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s3, s3, s0 +; SI-NEXT: s_add_i32 s7, s2, s0 ; SI-NEXT: BB1_4: ; %endif -; SI-NEXT: s_add_i32 s0, s3, s2 +; SI-NEXT: s_add_i32 s0, s7, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -109,18 +109,18 @@ ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 -; SI-NEXT: s_and_saveexec_b64 s[8:9], vcc -; SI-NEXT: s_xor_b64 s[8:9], exec, s[8:9] +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; SI-NEXT: s_cbranch_execz BB2_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s6, s2, s3 +; SI-NEXT: s_add_i32 s8, s2, s3 ; SI-NEXT: BB2_2: ; %Flow ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[2:3], s[8:9] -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_or_saveexec_b64 s[2:3], s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: s_xor_b64 exec, exec, s[2:3] ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_add_i32 s0, s0, s1 @@ -155,46 +155,45 @@ ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: ; implicit-def: $sgpr0_sgpr1 -; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 +; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc +; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] ; SI-NEXT: s_cbranch_execz BB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: BB3_2: ; %Flow -; SI-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] -; SI-NEXT: s_xor_b64 exec, exec, s[2:3] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[0:1], s[10:11] +; SI-NEXT: s_xor_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execz BB3_4 ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] ; SI-NEXT: BB3_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[2:3] -; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -166,8 +166,8 @@ ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 -; SI-NEXT: s_load_dword s8, s[0:1], 0xc -; SI-NEXT: s_brev_b32 s9, 44 +; SI-NEXT: s_load_dword s14, s[0:1], 0xc +; SI-NEXT: s_brev_b32 s8, 44 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1 ; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4 @@ -176,53 +176,53 @@ ; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] ; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9 +; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8 ; SI-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, 3 ; SI-NEXT: s_branch BB3_4 ; SI-NEXT: BB3_1: ; %Flow6 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[10:11], 0 +; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: BB3_2: ; %Flow5 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[14:15], 0 +; SI-NEXT: s_mov_b64 s[12:13], 0 ; SI-NEXT: BB3_3: ; %Flow ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[12:13] +; SI-NEXT: s_and_b64 vcc, exec, s[10:11] ; SI-NEXT: s_cbranch_vccnz BB3_8 ; SI-NEXT: BB3_4: ; %while.cond ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_mov_b64 s[14:15], -1 -; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[8:9], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[0:1] ; SI-NEXT: s_cbranch_vccz BB3_3 ; SI-NEXT: ; %bb.5: ; %convex.exit ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; SI-NEXT: s_mov_b64 s[8:9], -1 ; SI-NEXT: s_mov_b64 s[10:11], -1 -; SI-NEXT: s_mov_b64 s[12:13], -1 ; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: s_cbranch_vccz BB3_2 ; SI-NEXT: ; %bb.6: ; %if.end ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], -1 +; SI-NEXT: s_mov_b64 s[10:11], -1 ; SI-NEXT: s_mov_b64 vcc, s[4:5] ; SI-NEXT: s_cbranch_vccz BB3_1 ; SI-NEXT: ; %bb.7: ; %if.else ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_mov_b64 s[12:13], 0 +; SI-NEXT: s_mov_b64 s[10:11], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_branch BB3_1 ; SI-NEXT: BB3_8: ; %loop.exit.guard4 ; SI-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; SI-NEXT: s_and_b64 vcc, exec, s[10:11] +; SI-NEXT: s_and_b64 vcc, exec, s[8:9] ; SI-NEXT: s_cbranch_vccz BB3_4 ; SI-NEXT: ; %bb.9: ; %loop.exit.guard -; SI-NEXT: s_and_b64 vcc, exec, s[14:15] +; SI-NEXT: s_and_b64 vcc, exec, s[12:13] ; SI-NEXT: s_cbranch_vccz BB3_13 ; SI-NEXT: ; %bb.10: ; %for.cond.preheader -; SI-NEXT: s_cmpk_lt_i32 s8, 0x3e8 +; SI-NEXT: s_cmpk_lt_i32 s14, 0x3e8 ; SI-NEXT: s_cbranch_scc0 BB3_13 ; SI-NEXT: ; %bb.11: ; %for.body ; SI-NEXT: s_and_b64 vcc, exec, 0 @@ -239,8 +239,8 @@ ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; FLAT-NEXT: s_load_dword s8, s[0:1], 0x30 -; FLAT-NEXT: s_brev_b32 s9, 44 +; FLAT-NEXT: s_load_dword s14, s[0:1], 0x30 +; FLAT-NEXT: s_brev_b32 s8, 44 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_cmp_lt_i32_e64 s[0:1], s2, 1 ; FLAT-NEXT: v_cmp_lt_i32_e64 s[4:5], s3, 4 @@ -249,53 +249,53 @@ ; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] ; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3] ; FLAT-NEXT: s_waitcnt vmcnt(0) -; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s9 +; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8 ; FLAT-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; FLAT-NEXT: v_mov_b32_e32 v0, 3 ; FLAT-NEXT: s_branch BB3_4 ; FLAT-NEXT: BB3_1: ; %Flow6 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[10:11], 0 +; FLAT-NEXT: s_mov_b64 s[8:9], 0 ; FLAT-NEXT: BB3_2: ; %Flow5 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[14:15], 0 +; FLAT-NEXT: s_mov_b64 s[12:13], 0 ; FLAT-NEXT: BB3_3: ; %Flow ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] +; FLAT-NEXT: s_and_b64 vcc, exec, s[10:11] ; FLAT-NEXT: s_cbranch_vccnz BB3_8 ; FLAT-NEXT: BB3_4: ; %while.cond ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_mov_b64 s[14:15], -1 -; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 s[8:9], -1 +; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[0:1] ; FLAT-NEXT: s_cbranch_vccz BB3_3 ; FLAT-NEXT: ; %bb.5: ; %convex.exit ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 +; FLAT-NEXT: s_mov_b64 s[8:9], -1 ; FLAT-NEXT: s_mov_b64 s[10:11], -1 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[2:3] ; FLAT-NEXT: s_cbranch_vccz BB3_2 ; FLAT-NEXT: ; %bb.6: ; %if.end ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], -1 +; FLAT-NEXT: s_mov_b64 s[10:11], -1 ; FLAT-NEXT: s_mov_b64 vcc, s[4:5] ; FLAT-NEXT: s_cbranch_vccz BB3_1 ; FLAT-NEXT: ; %bb.7: ; %if.else ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_mov_b64 s[12:13], 0 +; FLAT-NEXT: s_mov_b64 s[10:11], 0 ; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: s_branch BB3_1 ; FLAT-NEXT: BB3_8: ; %loop.exit.guard4 ; FLAT-NEXT: ; in Loop: Header=BB3_4 Depth=1 -; FLAT-NEXT: s_and_b64 vcc, exec, s[10:11] +; FLAT-NEXT: s_and_b64 vcc, exec, s[8:9] ; FLAT-NEXT: s_cbranch_vccz BB3_4 ; FLAT-NEXT: ; %bb.9: ; %loop.exit.guard -; FLAT-NEXT: s_and_b64 vcc, exec, s[14:15] +; FLAT-NEXT: s_and_b64 vcc, exec, s[12:13] ; FLAT-NEXT: s_cbranch_vccz BB3_13 ; FLAT-NEXT: ; %bb.10: ; %for.cond.preheader -; FLAT-NEXT: s_cmpk_lt_i32 s8, 0x3e8 +; FLAT-NEXT: s_cmpk_lt_i32 s14, 0x3e8 ; FLAT-NEXT: s_cbranch_scc0 BB3_13 ; FLAT-NEXT: ; %bb.11: ; %for.body ; FLAT-NEXT: s_and_b64 vcc, exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -629,11 +629,11 @@ ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc -; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_xor_b64 s[4:5], exec, s[2:3] ; SI-NEXT: s_cbranch_execz BB10_4 ; SI-NEXT: ; %bb.1: ; %bb.preheader -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: BB10_2: ; %bb ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: ;;#ASMSTART @@ -655,13 +655,13 @@ ; SI-NEXT: ; %bb.3: ; %bb ; SI-NEXT: ; in Loop: Header=BB10_2 Depth=1 ; SI-NEXT: s_andn2_b64 exec, exec, vcc -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc +; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_b64 vcc, exec, vcc ; SI-NEXT: s_cbranch_vccnz BB10_2 ; SI-NEXT: BB10_4: ; %Flow1 -; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 8 @@ -1285,23 +1285,23 @@ ; SI-NEXT: s_cbranch_scc1 BB15_7 ; SI-NEXT: ; %bb.1: ; %.lr.ph ; SI-NEXT: s_mov_b64 s[2:3], exec -; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_branch BB15_3 ; SI-NEXT: BB15_2: ; %latch ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; SI-NEXT: s_or_b64 exec, exec, s[6:7] -; SI-NEXT: s_add_i32 s4, s4, 1 -; SI-NEXT: v_cmp_ge_i32_e32 vcc, s4, v1 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_add_i32 s6, s6, 1 +; SI-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 ; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execz BB15_6 ; SI-NEXT: BB15_3: ; %hdr ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 -; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz BB15_2 ; SI-NEXT: ; %bb.4: ; %kill ; SI-NEXT: ; in Loop: Header=BB15_3 Depth=1 @@ -1328,23 +1328,23 @@ ; GFX10-WAVE64-NEXT: s_cbranch_scc1 BB15_7 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %.lr.ph ; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec -; GFX10-WAVE64-NEXT: s_mov_b32 s4, 0 +; GFX10-WAVE64-NEXT: s_mov_b32 s6, 0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-WAVE64-NEXT: s_branch BB15_3 ; GFX10-WAVE64-NEXT: BB15_2: ; %latch ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX10-WAVE64-NEXT: s_add_i32 s4, s4, 1 -; GFX10-WAVE64-NEXT: v_cmp_ge_i32_e32 vcc, s4, v1 -; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-WAVE64-NEXT: s_add_i32 s6, s6, 1 +; GFX10-WAVE64-NEXT: v_cmp_ge_i32_e32 vcc, s6, v1 +; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-WAVE64-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX10-WAVE64-NEXT: s_cbranch_execz BB15_6 ; GFX10-WAVE64-NEXT: BB15_3: ; %hdr ; GFX10-WAVE64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-WAVE64-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX10-WAVE64-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 +; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX10-WAVE64-NEXT: s_cbranch_execz BB15_2 ; GFX10-WAVE64-NEXT: ; %bb.4: ; %kill ; GFX10-WAVE64-NEXT: ; in Loop: Header=BB15_3 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -17,7 +17,7 @@ ; GFX9-FLATSCR: s_mov_b32 [[SOFF1:s[0-9]+]], 4{{$}} ; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SOFF1]] ; 16-byte Folded Spill ; GFX9-FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x1{{[0-9a-f]+}}{{$}} -; GFX9-FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, [[SOFF2]] ; 16-byte Folded Reload +; GFX9-FLATSCR-NOT: scratch_load_dwordx4 off, v[{{[0-9:]+}}], [[SOFF2]] ; 16-byte Folded Reload ; GFX10-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], off offset:{{[0-9]+}} ; 16-byte Folded Spill ; GFX10-FLATSCR: scratch_load_dwordx4 v[{{[0-9:]+}}], off, off offset:{{[0-9]+}} ; 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 1796 +; GFX900: ScratchSize: 2052 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-bundle.mir @@ -14,42 +14,39 @@ ; RA: successors: %bb.1(0x80000000) ; RA: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; RA: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; RA: undef %5.sub1:sgpr_1024 = S_MOV_B32 -1 - ; RA: %5.sub0:sgpr_1024 = S_MOV_B32 -1 - ; RA: undef %4.sub0_sub1:sgpr_1024 = COPY %5.sub0_sub1 + ; RA: undef %2.sub1:sgpr_1024 = S_MOV_B32 -1 + ; RA: %2.sub0:sgpr_1024 = S_MOV_B32 -1 ; RA: undef %3.sub0:sgpr_1024 = S_MOV_B32 0 ; RA: bb.1: ; RA: successors: %bb.2(0x80000000) - ; RA: undef %6.sub0_sub1:sgpr_1024 = COPY %4.sub0_sub1 - ; RA: %6.sub2:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub3:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub4:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub5:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub6:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub7:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub8:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub9:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub10:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub11:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub12:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub13:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub14:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub15:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub16:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub17:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub18:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub19:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub20:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub21:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub22:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub23:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub24:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub25:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub26:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub27:sgpr_1024 = COPY %6.sub1 - ; RA: %6.sub28:sgpr_1024 = COPY %6.sub0 - ; RA: %6.sub29:sgpr_1024 = COPY %6.sub1 - ; RA: undef %4.sub0_sub1:sgpr_1024 = COPY %6.sub0_sub1 + ; RA: %2.sub2:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub3:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub4:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub5:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub6:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub7:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub8:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub9:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub10:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub11:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub12:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub13:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub14:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub15:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub16:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub17:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub18:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub19:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub20:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub21:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub22:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub23:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub24:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub25:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub26:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub27:sgpr_1024 = COPY %2.sub1 + ; RA: %2.sub28:sgpr_1024 = COPY %2.sub0 + ; RA: %2.sub29:sgpr_1024 = COPY %2.sub1 ; RA: %3.sub1:sgpr_1024 = COPY %3.sub0 ; RA: %3.sub2:sgpr_1024 = COPY %3.sub0 ; RA: %3.sub3:sgpr_1024 = COPY %3.sub0 @@ -89,79 +86,77 @@ ; VR-LABEL: name: splitkit_copy_bundle ; VR: bb.0: ; VR: successors: %bb.1(0x80000000) - ; VR: renamable $sgpr69 = S_MOV_B32 -1 - ; VR: renamable $sgpr68 = S_MOV_B32 -1 - ; VR: renamable $sgpr36 = S_MOV_B32 0 + ; VR: renamable $sgpr37 = S_MOV_B32 -1 + ; VR: renamable $sgpr36 = S_MOV_B32 -1 + ; VR: renamable $sgpr68 = S_MOV_B32 0 ; VR: renamable $sgpr34_sgpr35 = IMPLICIT_DEF - ; VR: renamable $sgpr70_sgpr71 = IMPLICIT_DEF + ; VR: renamable $sgpr66_sgpr67 = IMPLICIT_DEF ; VR: bb.1: ; VR: successors: %bb.2(0x80000000) - ; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71 - ; VR: renamable $sgpr40_sgpr41 = COPY killed renamable $sgpr68_sgpr69 - ; VR: renamable $sgpr42 = COPY renamable $sgpr40 - ; VR: renamable $sgpr43 = COPY renamable $sgpr41 - ; VR: renamable $sgpr44 = COPY renamable $sgpr40 - ; VR: renamable $sgpr45 = COPY renamable $sgpr41 - ; VR: renamable $sgpr46 = COPY renamable $sgpr40 - ; VR: renamable $sgpr47 = COPY renamable $sgpr41 - ; VR: renamable $sgpr48 = COPY renamable $sgpr40 - ; VR: renamable $sgpr49 = COPY renamable $sgpr41 - ; VR: renamable $sgpr50 = COPY renamable $sgpr40 - ; VR: renamable $sgpr51 = COPY renamable $sgpr41 - ; VR: renamable $sgpr52 = COPY renamable $sgpr40 - ; VR: renamable $sgpr53 = COPY renamable $sgpr41 - ; VR: renamable $sgpr54 = COPY renamable $sgpr40 - ; VR: renamable $sgpr55 = COPY renamable $sgpr41 - ; VR: renamable $sgpr56 = COPY renamable $sgpr40 - ; VR: renamable $sgpr57 = COPY renamable $sgpr41 - ; VR: renamable $sgpr58 = COPY renamable $sgpr40 - ; VR: renamable $sgpr59 = COPY renamable $sgpr41 - ; VR: renamable $sgpr60 = COPY renamable $sgpr40 - ; VR: renamable $sgpr61 = COPY renamable $sgpr41 - ; VR: renamable $sgpr62 = COPY renamable $sgpr40 - ; VR: renamable $sgpr63 = COPY renamable $sgpr41 - ; VR: renamable $sgpr64 = COPY renamable $sgpr40 - ; VR: renamable $sgpr65 = COPY renamable $sgpr41 - ; VR: renamable $sgpr66 = COPY renamable $sgpr40 - ; VR: renamable $sgpr67 = COPY renamable $sgpr41 - ; VR: renamable $sgpr68 = COPY renamable $sgpr40 - ; VR: renamable $sgpr69 = COPY renamable $sgpr41 - ; VR: renamable $sgpr68_sgpr69 = COPY killed renamable $sgpr40_sgpr41 - ; VR: renamable $sgpr37 = COPY renamable $sgpr36 + ; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr34_sgpr35, $sgpr66_sgpr67 ; VR: renamable $sgpr38 = COPY renamable $sgpr36 - ; VR: renamable $sgpr39 = COPY renamable $sgpr36 + ; VR: renamable $sgpr39 = COPY renamable $sgpr37 ; VR: renamable $sgpr40 = COPY renamable $sgpr36 - ; VR: renamable $sgpr41 = COPY renamable $sgpr36 + ; VR: renamable $sgpr41 = COPY renamable $sgpr37 ; VR: renamable $sgpr42 = COPY renamable $sgpr36 - ; VR: renamable $sgpr43 = COPY renamable $sgpr36 + ; VR: renamable $sgpr43 = COPY renamable $sgpr37 ; VR: renamable $sgpr44 = COPY renamable $sgpr36 - ; VR: renamable $sgpr45 = COPY renamable $sgpr36 + ; VR: renamable $sgpr45 = COPY renamable $sgpr37 ; VR: renamable $sgpr46 = COPY renamable $sgpr36 - ; VR: renamable $sgpr47 = COPY renamable $sgpr36 + ; VR: renamable $sgpr47 = COPY renamable $sgpr37 ; VR: renamable $sgpr48 = COPY renamable $sgpr36 - ; VR: renamable $sgpr49 = COPY renamable $sgpr36 + ; VR: renamable $sgpr49 = COPY renamable $sgpr37 ; VR: renamable $sgpr50 = COPY renamable $sgpr36 - ; VR: renamable $sgpr51 = COPY renamable $sgpr36 + ; VR: renamable $sgpr51 = COPY renamable $sgpr37 ; VR: renamable $sgpr52 = COPY renamable $sgpr36 - ; VR: renamable $sgpr53 = COPY renamable $sgpr36 + ; VR: renamable $sgpr53 = COPY renamable $sgpr37 ; VR: renamable $sgpr54 = COPY renamable $sgpr36 - ; VR: renamable $sgpr55 = COPY renamable $sgpr36 + ; VR: renamable $sgpr55 = COPY renamable $sgpr37 ; VR: renamable $sgpr56 = COPY renamable $sgpr36 - ; VR: renamable $sgpr57 = COPY renamable $sgpr36 + ; VR: renamable $sgpr57 = COPY renamable $sgpr37 ; VR: renamable $sgpr58 = COPY renamable $sgpr36 - ; VR: renamable $sgpr59 = COPY renamable $sgpr36 + ; VR: renamable $sgpr59 = COPY renamable $sgpr37 ; VR: renamable $sgpr60 = COPY renamable $sgpr36 - ; VR: renamable $sgpr61 = COPY renamable $sgpr36 + ; VR: renamable $sgpr61 = COPY renamable $sgpr37 ; VR: renamable $sgpr62 = COPY renamable $sgpr36 - ; VR: renamable $sgpr63 = COPY renamable $sgpr36 + ; VR: renamable $sgpr63 = COPY renamable $sgpr37 ; VR: renamable $sgpr64 = COPY renamable $sgpr36 - ; VR: renamable $sgpr65 = COPY renamable $sgpr36 - ; VR: renamable $sgpr66 = COPY renamable $sgpr36 - ; VR: renamable $sgpr67 = COPY renamable $sgpr36 + ; VR: renamable $sgpr65 = COPY renamable $sgpr37 + ; VR: renamable $sgpr69 = COPY renamable $sgpr68 + ; VR: renamable $sgpr70 = COPY renamable $sgpr68 + ; VR: renamable $sgpr71 = COPY renamable $sgpr68 + ; VR: renamable $sgpr72 = COPY renamable $sgpr68 + ; VR: renamable $sgpr73 = COPY renamable $sgpr68 + ; VR: renamable $sgpr74 = COPY renamable $sgpr68 + ; VR: renamable $sgpr75 = COPY renamable $sgpr68 + ; VR: renamable $sgpr76 = COPY renamable $sgpr68 + ; VR: renamable $sgpr77 = COPY renamable $sgpr68 + ; VR: renamable $sgpr78 = COPY renamable $sgpr68 + ; VR: renamable $sgpr79 = COPY renamable $sgpr68 + ; VR: renamable $sgpr80 = COPY renamable $sgpr68 + ; VR: renamable $sgpr81 = COPY renamable $sgpr68 + ; VR: renamable $sgpr82 = COPY renamable $sgpr68 + ; VR: renamable $sgpr83 = COPY renamable $sgpr68 + ; VR: renamable $sgpr84 = COPY renamable $sgpr68 + ; VR: renamable $sgpr85 = COPY renamable $sgpr68 + ; VR: renamable $sgpr86 = COPY renamable $sgpr68 + ; VR: renamable $sgpr87 = COPY renamable $sgpr68 + ; VR: renamable $sgpr88 = COPY renamable $sgpr68 + ; VR: renamable $sgpr89 = COPY renamable $sgpr68 + ; VR: renamable $sgpr90 = COPY renamable $sgpr68 + ; VR: renamable $sgpr91 = COPY renamable $sgpr68 + ; VR: renamable $sgpr92 = COPY renamable $sgpr68 + ; VR: renamable $sgpr93 = COPY renamable $sgpr68 + ; VR: renamable $sgpr94 = COPY renamable $sgpr68 + ; VR: renamable $sgpr95 = COPY renamable $sgpr68 + ; VR: renamable $sgpr96 = COPY renamable $sgpr68 + ; VR: renamable $sgpr97 = COPY renamable $sgpr68 + ; VR: renamable $sgpr98 = COPY renamable $sgpr68 + ; VR: renamable $sgpr99 = COPY renamable $sgpr68 ; VR: bb.2: ; VR: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x0000000000000003, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x000000000000000F, $sgpr34_sgpr35, $sgpr70_sgpr71 - ; VR: S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr70_sgpr71 + ; VR: liveins: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67:0x000000000000000F, $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99:0x0000000000000003, $sgpr34_sgpr35, $sgpr66_sgpr67 + ; VR: S_NOP 0, csr_amdgpu_highregs, implicit renamable $sgpr34_sgpr35, implicit renamable $sgpr66_sgpr67 ; VR: S_CBRANCH_VCCNZ %bb.1, implicit undef $vcc ; VR: S_BRANCH %bb.2 bb.0: @@ -309,11 +304,11 @@ ; VR: renamable $sgpr9 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr13, 0 :: (dereferenceable invariant load (s32)) ; VR: renamable $sgpr14 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr15, 0 :: (dereferenceable invariant load (s32)) ; VR: renamable $sgpr10_sgpr11 = IMPLICIT_DEF - ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0 :: (dereferenceable invariant load (s32)) - ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0 :: (dereferenceable invariant load (s32)) - ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18, 0 :: (dereferenceable invariant load (s32)) - ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19, 0 :: (dereferenceable invariant load (s32)) - ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21, 0 :: (dereferenceable invariant load (s32)) + ; VR: renamable $sgpr17 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr22, 0 :: (dereferenceable invariant load (s32)) + ; VR: renamable $sgpr15 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr16, 0 :: (dereferenceable invariant load (s32)) + ; VR: renamable $sgpr12 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr18, 0 :: (dereferenceable invariant load (s32)) + ; VR: renamable $sgpr13 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr19, 0 :: (dereferenceable invariant load (s32)) + ; VR: renamable $sgpr16 = S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr4_sgpr5_sgpr6_sgpr7, killed renamable $sgpr21, 0 :: (dereferenceable invariant load (s32)) ; VR: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr10_sgpr11, implicit killed renamable $sgpr8, implicit killed renamable $sgpr9, implicit killed renamable $sgpr12, implicit killed renamable $sgpr13, implicit killed renamable $sgpr14, implicit killed renamable $sgpr15, implicit killed renamable $sgpr16, implicit killed renamable $sgpr17 %0:sgpr_128 = IMPLICIT_DEF %1:sreg_64 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -133,78 +133,78 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 -; GCN-IR-NEXT: s_add_i32 s14, s12, 32 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 -; GCN-IR-NEXT: s_min_u32 s10, s14, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s8, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: s_xor_b64 s[18:19], s[12:13], -1 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_min_u32 s8, s12, s8 +; GCN-IR-NEXT: s_min_u32 s12, s10, s11 +; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[10:11], 63 +; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[10:11], 63 +; GCN-IR-NEXT: s_xor_b64 s[18:19], s[14:15], -1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_addc_u32 s17, s9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_add_u32 s14, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[6:7], s8 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 ; GCN-IR-NEXT: s_cbranch_vccz BB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[6:7], s16 -; GCN-IR-NEXT: s_add_u32 s8, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s9, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 -; GCN-IR-NEXT: s_add_u32 s10, s2, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s3, s11 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 +; GCN-IR-NEXT: s_add_u32 s16, s0, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] +; GCN-IR-NEXT: s_mov_b32 s13, s9 +; GCN-IR-NEXT: s_add_u32 s8, s2, s12 +; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s13, 31 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s2, s8, s16 -; GCN-IR-NEXT: s_subb_u32 s2, s9, s17 -; GCN-IR-NEXT: s_ashr_i32 s14, s2, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s2, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s16, s16, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_subb_u32 s17, s17, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 -; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[2:3] +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s2, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s2, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s2, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: s_add_u32 s8, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB0_3 ; GCN-IR-NEXT: BB0_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[12:13] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: BB0_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s0, v0 @@ -372,72 +372,72 @@ ; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] ; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3 ; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v6 -; GCN-IR-NEXT: v_min_u32_e32 v12, v3, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v0 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 -; GCN-IR-NEXT: v_min_u32_e32 v14, v3, v7 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v12, v14 +; GCN-IR-NEXT: v_min_u32_e32 v3, v3, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v0 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v1 +; GCN-IR-NEXT: v_min_u32_e32 v12, v7, v8 +; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v3, v12 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v13 -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v1, 0, s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 +; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v0, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v7 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[7:8] +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[7:8] +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v7 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v3 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v5 -; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[0:1], v9 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v6, vcc -; GCN-IR-NEXT: v_not_b32_e32 v10, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v18, 0 -; GCN-IR-NEXT: v_not_b32_e32 v11, v13 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, v10, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v19, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, v11, v15, vcc +; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v6, vcc +; GCN-IR-NEXT: v_not_b32_e32 v3, v3 +; GCN-IR-NEXT: v_not_b32_e32 v9, v11 +; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 +; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[16:17], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v10 +; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v3, 31, v8 +; GCN-IR-NEXT: v_or_b32_e32 v3, v14, v3 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v3, v14 -; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v9, v15, vcc -; GCN-IR-NEXT: v_or_b32_e32 v7, v18, v7 -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, 1, v12 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v16, 31, v10 -; GCN-IR-NEXT: v_or_b32_e32 v8, v19, v8 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, 0, v13, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[18:19], v[12:13] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v18 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: v_and_b32_e32 v10, 1, v16 -; GCN-IR-NEXT: v_and_b32_e32 v17, v16, v6 -; GCN-IR-NEXT: v_and_b32_e32 v16, v16, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v14, v16 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v19 -; GCN-IR-NEXT: v_mov_b32_e32 v19, v11 -; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v15, v17, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v3 +; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc +; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 +; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v6 +; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v3, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 +; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v18, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -445,11 +445,11 @@ ; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v8 -; GCN-IR-NEXT: v_or_b32_e32 v9, v10, v7 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8 +; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 ; GCN-IR-NEXT: BB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v5, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v5, v10 ; GCN-IR-NEXT: v_mul_hi_u32 v7, v5, v9 ; GCN-IR-NEXT: v_mul_lo_u32 v6, v6, v9 ; GCN-IR-NEXT: v_mul_lo_u32 v5, v5, v9 @@ -1030,79 +1030,79 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 ; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[12:13] +; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[12:13] ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s2 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s9 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s16, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[10:11], 63 -; GCN-IR-NEXT: s_mov_b32 s13, 0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[18:19], s[10:11], 63 -; GCN-IR-NEXT: s_xor_b64 s[20:21], s[14:15], -1 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s3 +; GCN-IR-NEXT: s_min_u32 s10, s10, s11 +; GCN-IR-NEXT: s_min_u32 s14, s12, s13 +; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_xor_b64 s[20:21], s[16:17], -1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[20:21], s[18:19] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_addc_u32 s19, s11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 +; GCN-IR-NEXT: s_add_u32 s16, s12, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 ; GCN-IR-NEXT: s_cbranch_vccz BB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[2:3], s18 -; GCN-IR-NEXT: s_add_u32 s10, s8, -1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 -; GCN-IR-NEXT: s_add_u32 s12, s6, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s7, s13 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[2:3], s16 +; GCN-IR-NEXT: s_add_u32 s18, s8, -1 +; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 +; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] +; GCN-IR-NEXT: s_mov_b32 s15, s11 +; GCN-IR-NEXT: s_add_u32 s10, s6, s14 +; GCN-IR-NEXT: s_addc_u32 s11, s7, s11 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s6, s15, 31 -; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[6:7] -; GCN-IR-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] -; GCN-IR-NEXT: s_sub_u32 s6, s10, s18 -; GCN-IR-NEXT: s_subb_u32 s6, s11, s19 -; GCN-IR-NEXT: s_ashr_i32 s16, s6, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s6, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[20:21], s[16:17], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s18, s18, s20 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: s_subb_u32 s19, s19, s21 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 -; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-IR-NEXT: s_lshr_b32 s6, s13, 31 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GCN-IR-NEXT: s_sub_u32 s6, s18, s16 +; GCN-IR-NEXT: s_subb_u32 s6, s19, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s6, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[20:21], s[14:15], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB8_3 ; GCN-IR-NEXT: BB8_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_branch BB8_6 ; GCN-IR-NEXT: BB8_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] ; GCN-IR-NEXT: BB8_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 @@ -1192,79 +1192,79 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[12:13] +; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[12:13] ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_flbit_i32_b32 s12, s2 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s16, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s12, s16 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[10:11], 63 -; GCN-IR-NEXT: s_mov_b32 s13, 0 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[18:19], s[10:11], 63 -; GCN-IR-NEXT: s_xor_b64 s[20:21], s[14:15], -1 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s13, s3 +; GCN-IR-NEXT: s_min_u32 s10, s10, s11 +; GCN-IR-NEXT: s_min_u32 s14, s12, s13 +; GCN-IR-NEXT: s_sub_u32 s12, s10, s14 +; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[18:19], s[12:13], 63 +; GCN-IR-NEXT: s_xor_b64 s[20:21], s[16:17], -1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[20:21], s[18:19] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s18, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_addc_u32 s19, s11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 +; GCN-IR-NEXT: s_add_u32 s16, s12, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 +; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[2:3], s12 ; GCN-IR-NEXT: s_cbranch_vccz BB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[18:19], s[2:3], s18 -; GCN-IR-NEXT: s_add_u32 s10, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 -; GCN-IR-NEXT: s_add_u32 s12, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s13 -; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 +; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[2:3], s16 +; GCN-IR-NEXT: s_add_u32 s18, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 +; GCN-IR-NEXT: s_not_b64 s[8:9], s[10:11] +; GCN-IR-NEXT: s_mov_b32 s15, s11 +; GCN-IR-NEXT: s_add_u32 s10, s8, s14 +; GCN-IR-NEXT: s_addc_u32 s11, s9, s11 +; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s8, s15, 31 -; GCN-IR-NEXT: s_lshl_b64 s[18:19], s[18:19], 1 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[8:9] -; GCN-IR-NEXT: s_or_b64 s[14:15], s[16:17], s[14:15] -; GCN-IR-NEXT: s_sub_u32 s8, s10, s18 -; GCN-IR-NEXT: s_subb_u32 s8, s11, s19 -; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 -; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s8, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[20:21], s[16:17], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s18, s18, s20 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 -; GCN-IR-NEXT: s_subb_u32 s19, s19, s21 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 -; GCN-IR-NEXT: s_add_u32 s12, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] +; GCN-IR-NEXT: s_lshr_b32 s8, s13, 31 +; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] +; GCN-IR-NEXT: s_sub_u32 s8, s18, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s19, s17 +; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 +; GCN-IR-NEXT: s_mov_b32 s15, s14 +; GCN-IR-NEXT: s_and_b32 s8, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[20:21], s[14:15], s[6:7] +; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB9_3 ; GCN-IR-NEXT: BB9_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[12:13], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_branch BB9_6 ; GCN-IR-NEXT: BB9_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] ; GCN-IR-NEXT: BB9_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 @@ -1416,61 +1416,61 @@ ; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4 ; GCN-IR-NEXT: s_add_i32 s2, s2, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5 -; GCN-IR-NEXT: s_min_u32 s8, s2, s3 -; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 +; GCN-IR-NEXT: s_min_u32 s6, s2, s3 +; GCN-IR-NEXT: s_add_u32 s8, s6, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: s_xor_b64 s[14:15], s[10:11], -1 ; GCN-IR-NEXT: s_and_b64 s[12:13], s[14:15], s[12:13] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s6, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_add_u32 s10, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], 24, s6 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s7 ; GCN-IR-NEXT: s_cbranch_vccz BB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], 24, s12 -; GCN-IR-NEXT: s_add_u32 s6, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s7, s5, -1 -; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s10 +; GCN-IR-NEXT: s_add_u32 s14, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s5, -1 +; GCN-IR-NEXT: s_sub_u32 s6, 58, s6 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s2, s6, s14 -; GCN-IR-NEXT: s_subb_u32 s2, s7, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s2, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[12:13], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s2, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s2, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_and_b32 s2, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[16:17], s[10:11], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB10_3 ; GCN-IR-NEXT: BB10_4: ; %Flow5 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 @@ -1614,26 +1614,26 @@ ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 -; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 +; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v8 +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 ; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] ; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4] +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[3:4] ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1642,38 +1642,38 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB11_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6 -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v4 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v6, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v7, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v4 -; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v4 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v5 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v4 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -1847,39 +1847,39 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB12_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -135,68 +135,68 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 ; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_min_u32 s8, s12, s8 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[10:11], 63 -; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_min_u32 s10, s12, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_min_u32 s12, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s8, s10, s12 +; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[8:9], 63 ; GCN-IR-NEXT: s_xor_b64 s[18:19], s[14:15], -1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: s_add_u32 s14, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_addc_u32 s15, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 ; GCN-IR-NEXT: s_cbranch_vccz BB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s6, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s7, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] -; GCN-IR-NEXT: s_mov_b32 s13, s9 -; GCN-IR-NEXT: s_add_u32 s8, s2, s12 -; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 +; GCN-IR-NEXT: s_add_u32 s16, s0, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] +; GCN-IR-NEXT: s_add_u32 s6, s2, s12 +; GCN-IR-NEXT: s_addc_u32 s7, s3, s11 +; GCN-IR-NEXT: s_mov_b32 s13, s11 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s2, s6, s14 -; GCN-IR-NEXT: s_subb_u32 s2, s7, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s2, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[12:13], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-IR-NEXT: s_lshr_b32 s2, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s2, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s2, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_and_b32 s2, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB0_3 ; GCN-IR-NEXT: BB0_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[10:11], 1 +; GCN-IR-NEXT: s_lshl_b64 s[0:1], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -370,40 +370,40 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v6, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 -; GCN-IR-NEXT: v_not_b32_e32 v7, v9 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_not_b32_e32 v0, v8 +; GCN-IR-NEXT: v_not_b32_e32 v1, v9 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v1, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v14 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v12, v8, v2 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v13, v8, v3 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 -; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -833,68 +833,68 @@ ; GCN-IR-NEXT: s_mov_b64 s[0:1], 0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 ; GCN-IR-NEXT: s_add_i32 s8, s8, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 -; GCN-IR-NEXT: s_min_u32 s8, s8, s9 -; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 -; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[10:11], 63 -; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_min_u32 s10, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 +; GCN-IR-NEXT: s_add_i32 s8, s8, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 +; GCN-IR-NEXT: s_min_u32 s12, s8, s9 +; GCN-IR-NEXT: s_sub_u32 s8, s10, s12 +; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[8:9], 63 ; GCN-IR-NEXT: s_xor_b64 s[18:19], s[14:15], -1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: s_add_u32 s14, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_addc_u32 s15, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 ; GCN-IR-NEXT: s_cbranch_vccz BB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 -; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: s_not_b64 s[0:1], s[8:9] -; GCN-IR-NEXT: s_mov_b32 s13, s9 -; GCN-IR-NEXT: s_add_u32 s8, s0, s12 -; GCN-IR-NEXT: s_addc_u32 s9, s1, s9 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 +; GCN-IR-NEXT: s_add_u32 s16, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 +; GCN-IR-NEXT: s_not_b64 s[0:1], s[10:11] +; GCN-IR-NEXT: s_add_u32 s6, s0, s12 +; GCN-IR-NEXT: s_addc_u32 s7, s1, s11 +; GCN-IR-NEXT: s_mov_b32 s13, s11 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s1, 0 ; GCN-IR-NEXT: BB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s0, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[0:1] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s0, s6, s14 -; GCN-IR-NEXT: s_subb_u32 s0, s7, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s0, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s0, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[12:13], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[0:1] +; GCN-IR-NEXT: s_lshr_b32 s0, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s0, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s0, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s0, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_and_b32 s0, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB7_3 ; GCN-IR-NEXT: BB7_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 @@ -1034,61 +1034,61 @@ ; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 ; GCN-IR-NEXT: s_add_i32 s4, s4, 32 -; GCN-IR-NEXT: s_min_u32 s8, s4, s5 -; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 +; GCN-IR-NEXT: s_min_u32 s6, s4, s5 +; GCN-IR-NEXT: s_add_u32 s8, s6, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: s_xor_b64 s[14:15], s[10:11], -1 ; GCN-IR-NEXT: s_and_b64 s[12:13], s[14:15], s[12:13] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s6, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_add_u32 s10, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], 24, s6 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s7 ; GCN-IR-NEXT: s_cbranch_vccz BB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], 24, s12 -; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s10 +; GCN-IR-NEXT: s_add_u32 s14, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 +; GCN-IR-NEXT: s_sub_u32 s6, 58, s6 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s4, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s4, s6, s14 -; GCN-IR-NEXT: s_subb_u32 s4, s7, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s4, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[12:13], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s4, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_and_b32 s4, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[16:17], s[10:11], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB8_3 ; GCN-IR-NEXT: BB8_4: ; %Flow5 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[10:11], 1 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 @@ -1245,39 +1245,39 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB9_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -202,14 +202,14 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: flat_load_dword v41, v[1:2] -; GCN-NEXT: v_mov_b32_e32 v43, 0 +; GCN-NEXT: flat_load_dword v43, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v42, 0 ; GCN-NEXT: s_getpc_b64 s[36:37] ; GCN-NEXT: s_add_u32 s36, s36, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s37, s37, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v41 +; GCN-NEXT: v_cmp_eq_f32_e64 s[34:35], 0, v43 ; GCN-NEXT: s_branch BB1_3 ; GCN-NEXT: BB1_1: ; %bb10 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 @@ -228,7 +228,7 @@ ; GCN-NEXT: BB1_4: ; %bb2 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: flat_load_dword v0, v[42:43] +; GCN-NEXT: flat_load_dword v0, v[41:42] ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) @@ -272,7 +272,7 @@ ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: BB1_10: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: s_branch BB1_2 bb: %tmp = load float, float* null, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -133,78 +133,78 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[0:1], 0 ; GCN-IR-NEXT: s_flbit_i32_b32 s12, s0 -; GCN-IR-NEXT: s_add_i32 s14, s12, 32 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[10:11] +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 +; GCN-IR-NEXT: s_add_i32 s12, s12, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s8, s1 -; GCN-IR-NEXT: s_min_u32 s10, s14, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_sub_u32 s8, s10, s14 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: s_mov_b32 s11, 0 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[8:9], 63 -; GCN-IR-NEXT: s_xor_b64 s[18:19], s[12:13], -1 +; GCN-IR-NEXT: s_add_i32 s10, s10, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 +; GCN-IR-NEXT: s_min_u32 s8, s12, s8 +; GCN-IR-NEXT: s_min_u32 s12, s10, s11 +; GCN-IR-NEXT: s_sub_u32 s10, s8, s12 +; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[10:11], 63 +; GCN-IR-NEXT: s_mov_b32 s9, 0 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[16:17], s[10:11], 63 +; GCN-IR-NEXT: s_xor_b64 s[18:19], s[14:15], -1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_addc_u32 s17, s9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_add_u32 s14, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[6:7], s8 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s10 ; GCN-IR-NEXT: s_cbranch_vccz BB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[6:7], s16 -; GCN-IR-NEXT: s_add_u32 s8, s0, -1 -; GCN-IR-NEXT: s_addc_u32 s9, s1, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 -; GCN-IR-NEXT: s_add_u32 s10, s2, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s3, s11 -; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 +; GCN-IR-NEXT: s_add_u32 s16, s0, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s1, -1 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[8:9] +; GCN-IR-NEXT: s_mov_b32 s13, s9 +; GCN-IR-NEXT: s_add_u32 s8, s2, s12 +; GCN-IR-NEXT: s_addc_u32 s9, s3, s9 +; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 ; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: BB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s2, s13, 31 -; GCN-IR-NEXT: s_lshl_b64 s[16:17], s[16:17], 1 -; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[12:13], s[14:15], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s2, s8, s16 -; GCN-IR-NEXT: s_subb_u32 s2, s9, s17 -; GCN-IR-NEXT: s_ashr_i32 s14, s2, 31 -; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s2, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[0:1] -; GCN-IR-NEXT: s_sub_u32 s16, s16, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 -; GCN-IR-NEXT: s_subb_u32 s17, s17, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 -; GCN-IR-NEXT: s_add_u32 s10, s10, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[2:3] +; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 +; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s2, s16, s14 +; GCN-IR-NEXT: s_subb_u32 s2, s17, s15 +; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_and_b32 s2, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[0:1] +; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: s_add_u32 s8, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB0_3 ; GCN-IR-NEXT: BB0_4: ; %Flow6 -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; GCN-IR-NEXT: s_branch BB0_6 ; GCN-IR-NEXT: BB0_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s7 -; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[12:13] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: BB0_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s0, v0 @@ -348,71 +348,71 @@ ; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 +; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v10, v12 +; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v8, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 -; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[5:6] -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[5:6] +; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v7 -; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v8, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_not_b32_e32 v9, v11 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_not_b32_e32 v6, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 +; GCN-IR-NEXT: v_not_b32_e32 v7, v9 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v7, v13, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v16, v4 -; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v10 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v17, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v11, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v16 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 -; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3 -; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2 -; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v12, v14 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v17, v9 -; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v13, v15, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v16, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v17, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v14, v4 +; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 +; GCN-IR-NEXT: v_or_b32_e32 v5, v15, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[8:9] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v14 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 +; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v15, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -420,8 +420,8 @@ ; GCN-IR-NEXT: BB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v5 -; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 +; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5 +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 ; GCN-IR-NEXT: BB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v7 @@ -853,61 +853,61 @@ ; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 ; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 ; GCN-IR-NEXT: s_add_i32 s4, s4, 32 -; GCN-IR-NEXT: s_min_u32 s8, s4, s5 -; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 +; GCN-IR-NEXT: s_min_u32 s6, s4, s5 +; GCN-IR-NEXT: s_add_u32 s8, s6, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[6:7], 63 +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[8:9], 63 ; GCN-IR-NEXT: s_xor_b64 s[14:15], s[10:11], -1 ; GCN-IR-NEXT: s_and_b64 s[12:13], s[14:15], s[12:13] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_cbranch_vccz BB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s12, s6, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] -; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_add_u32 s10, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] +; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, vcc -; GCN-IR-NEXT: s_lshl_b64 s[10:11], 24, s6 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], 24, s7 ; GCN-IR-NEXT: s_cbranch_vccz BB6_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], 24, s12 -; GCN-IR-NEXT: s_add_u32 s6, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s7, s3, -1 -; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], 24, s10 +; GCN-IR-NEXT: s_add_u32 s14, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 +; GCN-IR-NEXT: s_sub_u32 s6, 58, s6 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: BB6_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: s_lshr_b32 s4, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[4:5] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s4, s6, s14 -; GCN-IR-NEXT: s_subb_u32 s4, s7, s15 -; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31 -; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s4, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[12:13], s[2:3] -; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 -; GCN-IR-NEXT: s_add_u32 s8, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s4, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 +; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_and_b32 s4, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[16:17], s[10:11], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, vcc ; GCN-IR-NEXT: s_cbranch_vccz BB6_3 ; GCN-IR-NEXT: BB6_4: ; %Flow5 -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 @@ -1268,39 +1268,39 @@ ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz BB8_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v8 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, 47, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, 0, v5, vcc ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v3 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 +; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v6 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v11, vcc -; GCN-IR-NEXT: v_or_b32_e32 v2, v12, v2 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 -; GCN-IR-NEXT: v_and_b32_e32 v15, v12, v0 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 -; GCN-IR-NEXT: v_and_b32_e32 v14, v12, v1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v4 -; GCN-IR-NEXT: v_or_b32_e32 v3, v13, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v4, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v8 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v9, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_and_b32_e32 v15, v10, v0 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 +; GCN-IR-NEXT: v_and_b32_e32 v14, v10, v1 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 +; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v4, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 -; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB8_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow