Index: llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -475,39 +475,25 @@
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
-  MachineBasicBlock::iterator InsPt = MBB.begin();
+  MachineBasicBlock::iterator InsPt = MI;
-  // If we have instructions that aren't prolog instructions, split the block
-  // and emit a terminator instruction. This ensures correct spill placement.
-  // FIXME: We should unconditionally split the block here.
-  bool NeedBlockSplit = false;
   Register DataReg = MI.getOperand(0).getReg();
-  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
-       I != E; ++I) {
-    if (I->modifiesRegister(DataReg, TRI)) {
-      NeedBlockSplit = true;
-      break;
-    }
-  }
-  unsigned Opcode = OrOpc;
-  MachineBasicBlock *SplitBB = &MBB;
-  if (NeedBlockSplit) {
-    SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
-    if (MDT && SplitBB != &MBB) {
-      MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
-      SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
-                                                 MBBNode->end());
-      MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
-      for (MachineDomTreeNode *Child : Children)
-        MDT->changeImmediateDominator(Child, SplitBBNode);
-    }
-    Opcode = OrTermrOpc;
-    InsPt = MI;
+  // If we have instructions that aren't prolog instructions, split the block
+  // and emit a terminator instruction. This ensures correct spill placement
+  // relative to exec writes.
+  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
+  if (MDT && SplitBB != &MBB) {
+    MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
+    SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
+                                               MBBNode->end());
+    MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
+    for (MachineDomTreeNode *Child : Children)
+      MDT->changeImmediateDominator(Child, SplitBBNode);
   }
   MachineInstr *NewMI =
-      BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
+      BuildMI(MBB, InsPt, DL, TII->get(OrTermrOpc), Exec)
           .addReg(Exec)
           .add(MI.getOperand(0));
   if (LV) {
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -899,21 +899,21 @@
 ; SI-NEXT: s_or_b32 s6, s2, s4
 ; SI-NEXT: s_lshl_b32 s2, s2, 16
 ; SI-NEXT: s_or_b32 s7, s3, s2
-; SI-NEXT: s_mov_b32 s4, 0
+; SI-NEXT: s_mov_b32 s8, 0
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT: s_xor_b64 s[4:5], exec, s[2:3]
 ; SI-NEXT: s_cbranch_execz .LBB7_3
 ; SI-NEXT: ; %bb.1: ; %.demote0
 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT: s_cbranch_scc0 .LBB7_9
 ; SI-NEXT: ; %bb.2: ; %.demote0
-; SI-NEXT: s_wqm_b64 s[8:9], s[0:1]
-; SI-NEXT: s_and_b64 exec, exec, s[8:9]
+; SI-NEXT: s_wqm_b64 s[2:3], s[0:1]
+; SI-NEXT: s_and_b64 exec, exec, s[2:3]
 ; SI-NEXT: .LBB7_3: ; %.continue0.preheader
-; SI-NEXT: s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT: s_mov_b64 s[2:3], 0
-; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: v_mov_b32_e32 v0, s8
 ; SI-NEXT: s_branch .LBB7_5
 ; SI-NEXT: .LBB7_4: ; %.continue1
 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
@@ -949,8 +949,8 @@
 ; SI-NEXT: s_and_b64 exec, exec, s[8:9]
 ; SI-NEXT: s_branch .LBB7_4
 ; SI-NEXT: .LBB7_8: ; %.return
-; SI-NEXT: s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT: v_mov_b32_e32 v0, s6
 ; SI-NEXT: v_mov_b32_e32 v1, s7
 ; SI-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
@@ -965,21 +965,21 @@
 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
 ; GFX9-NEXT: s_wqm_b64 exec, exec
 ; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s6, 0
 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[2:3]
 ; GFX9-NEXT: s_cbranch_execz .LBB7_3
 ; GFX9-NEXT: ; %bb.1: ; %.demote0
 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
 ; GFX9-NEXT: ; %bb.2: ; %.demote0
-; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_wqm_b64 s[2:3], s[0:1]
+; GFX9-NEXT: s_and_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
 ; GFX9-NEXT: s_branch .LBB7_5
 ; GFX9-NEXT: .LBB7_4: ; %.continue1
 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
@@ -1015,8 +1015,8 @@
 ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
 ; GFX9-NEXT: s_branch .LBB7_4
 ; GFX9-NEXT: .LBB7_8: ; %.return
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
 ; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
@@ -1078,8 +1078,8 @@
 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
 ; GFX10-32-NEXT: s_branch .LBB7_4
 ; GFX10-32-NEXT: .LBB7_8: ; %.return
-; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
 ; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
@@ -1094,21 +1094,21 @@
 ; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
 ; GFX10-64-NEXT: s_wqm_b64 exec, exec
 ; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX10-64-NEXT: s_mov_b32 s4, 0
+; GFX10-64-NEXT: s_mov_b32 s6, 0
 ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[2:3]
 ; GFX10-64-NEXT: s_cbranch_execz .LBB7_3
 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0
-; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
-; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT: s_wqm_b64 s[2:3], s[0:1]
+; GFX10-64-NEXT: s_and_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
-; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT: v_mov_b32_e32 v0, s6
 ; GFX10-64-NEXT: s_branch .LBB7_5
 ; GFX10-64-NEXT: .LBB7_4: ; %.continue1
 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
@@ -1142,8 +1142,8 @@
 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
 ; GFX10-64-NEXT: s_branch .LBB7_4
 ; GFX10-64-NEXT: .LBB7_8: ; %.return
-; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
 ; GFX10-64-NEXT:
v_bfrev_b32_e32 v1, 60 ; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm Index: llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir +++ llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir @@ -34,8 +34,8 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]], 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %15, 0, implicit $exec - ; CHECK-NEXT: %7:vgpr_32, dead %8:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %7, %subreg.sub1 + ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1 ; CHECK-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; CHECK-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8), addrspace 1) @@ -55,7 +55,7 @@ ; CHECK-NEXT: successors: %bb.6(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec - ; CHECK-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) + ; CHECK-NEXT: dead [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8), addrspace 1) ; CHECK-NEXT: S_BRANCH %bb.6 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: @@ -75,7 +75,11 @@ ; CHECK-NEXT: S_BRANCH %bb.5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.7: - ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed [[S_AND_B32_1]], implicit-def $scc + ; CHECK-NEXT: successors: %bb.8(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32_term $exec_lo, killed [[S_AND_B32_1]], implicit-def $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.8: ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.2(0x40000000), %bb.5(0x40000000) Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -59,7 +59,7 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 +; GCN-O0-NEXT: s_cbranch_execz .LBB0_5 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 @@ -110,10 +110,12 @@ ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end +; GCN-O0-NEXT: ; %bb.4: ; %Flow +; 
GCN-O0-NEXT: .LBB0_5: ; %bb.outer.end ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: ; %bb.6: ; %bb.outer.end ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 @@ -230,7 +232,7 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 +; GCN-O0-NEXT: s_cbranch_execz .LBB1_5 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload @@ -251,18 +253,20 @@ ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 -; GCN-O0-NEXT: s_branch .LBB1_4 +; GCN-O0-NEXT: s_branch .LBB1_5 ; GCN-O0-NEXT: .LBB1_3: ; %Flow ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_branch .LBB1_5 -; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end +; GCN-O0-NEXT: ; %bb.4: ; %Flow +; GCN-O0-NEXT: s_branch .LBB1_7 +; GCN-O0-NEXT: .LBB1_5: ; %bb.inner.end +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: ; %bb.6: ; %bb.inner.end ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4 -; GCN-O0-NEXT: v_readlane_b32 s3, v1, 5 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 @@ -280,7 +284,7 @@ ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] ; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB1_3 -; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end +; GCN-O0-NEXT: .LBB1_7: ; %bb.outer.end ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 @@ -401,7 +405,7 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 +; GCN-O0-NEXT: s_cbranch_execz .LBB2_7 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then ; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 @@ -469,10 +473,12 @@ ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end +; GCN-O0-NEXT: ; %bb.6: ; %Flow1 +; GCN-O0-NEXT: .LBB2_7: ; %bb.outer.end ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: ; %bb.8: ; %bb.outer.end ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 @@ -627,7 +633,7 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 ; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 +; GCN-O0-NEXT: s_cbranch_execz .LBB3_10 ; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload @@ -651,7 +657,7 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 ; 
GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] -; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 +; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload @@ -667,7 +673,7 @@ ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8 -; GCN-O0-NEXT: s_branch .LBB3_7 +; GCN-O0-NEXT: s_branch .LBB3_8 ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else ; GCN-O0-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload @@ -710,15 +716,18 @@ ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: ; %bb.7: ; %Flow ; GCN-O0-NEXT: s_branch .LBB3_1 -; GCN-O0-NEXT: .LBB3_7: ; %Flow1 +; GCN-O0-NEXT: .LBB3_8: ; %Flow1 ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end +; GCN-O0-NEXT: ; %bb.9: ; %Flow1 +; GCN-O0-NEXT: .LBB3_10: ; %bb.outer.end ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: ; %bb.11: ; %bb.outer.end ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 @@ -824,6 +833,7 @@ ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: ; %bb.3: ; %bb.end ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_barrier ; GCN-O0-NEXT: s_endpgm @@ -952,10 +962,12 @@ ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_readlane_b32 s4, v1, 6 ; GCN-O0-NEXT: v_readlane_b32 s5, v1, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: ; %bb.3: ; %bb2 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 @@ -981,8 +993,8 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s5, 11 ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 -; GCN-O0-NEXT: ; %bb.3: ; %bb4 +; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 +; GCN-O0-NEXT: ; %bb.4: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 @@ -1009,8 +1021,8 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s5, 13 ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 -; GCN-O0-NEXT: ; %bb.4: ; %bb8 +; GCN-O0-NEXT: s_cbranch_execz .LBB5_8 +; GCN-O0-NEXT: ; %bb.5: ; %bb8 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_mov_b32 s10, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 @@ -1033,42 +1045,46 @@ ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; 
GCN-O0-NEXT: s_branch .LBB5_6 -; GCN-O0-NEXT: .LBB5_5: ; %Flow2 +; GCN-O0-NEXT: s_branch .LBB5_8 +; GCN-O0-NEXT: .LBB5_6: ; %Flow2 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: ; %bb.7: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_branch .LBB5_7 -; GCN-O0-NEXT: .LBB5_6: ; %Flow +; GCN-O0-NEXT: s_branch .LBB5_10 +; GCN-O0-NEXT: .LBB5_8: ; %Flow +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: ; %bb.9: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_branch .LBB5_5 -; GCN-O0-NEXT: .LBB5_7: ; %bb10 +; GCN-O0-NEXT: s_branch .LBB5_6 +; GCN-O0-NEXT: .LBB5_10: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: v_readlane_b32 s6, v1, 8 ; GCN-O0-NEXT: v_readlane_b32 s7, v1, 9 @@ -1080,23 +1096,25 @@ ; GCN-O0-NEXT: v_writelane_b32 v1, s5, 17 ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 -; GCN-O0-NEXT: ; %bb.8: ; %Flow1 +; GCN-O0-NEXT: s_cbranch_execz .LBB5_12 +; GCN-O0-NEXT: ; %bb.11: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 ; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 ; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 -; GCN-O0-NEXT: .LBB5_9: ; %Flow3 +; GCN-O0-NEXT: .LBB5_12: ; %Flow3 +; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 
16 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 17 +; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-O0-NEXT: ; %bb.13: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: v_readlane_b32 s8, v1, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v1, 17 -; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-O0-NEXT: v_readlane_b32 s6, v1, 4 ; GCN-O0-NEXT: v_readlane_b32 s7, v1, 5 ; GCN-O0-NEXT: v_readlane_b32 s4, v1, 14 @@ -1120,11 +1138,11 @@ ; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 -; GCN-O0-NEXT: ; %bb.10: ; %bb12 +; GCN-O0-NEXT: ; %bb.14: ; %bb12 ; GCN-O0-NEXT: v_readlane_b32 s4, v1, 18 ; GCN-O0-NEXT: v_readlane_b32 s5, v1, 19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: ; %bb.11: ; %bb12 +; GCN-O0-NEXT: ; %bb.15: ; %bb12 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/collapse-endcf.mir +++ llvm/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -18,20 +18,29 @@ ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: DBG_VALUE ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} ; GCN-NEXT: DBG_VALUE + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -89,7 +98,11 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: S_ENDPGM 0 bb.0: %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -146,7 +159,11 @@ ; GCN-NEXT: DBG_VALUE ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = 
S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -208,7 +225,11 @@ ; GCN-NEXT: KILL [[DEF]] ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -253,18 +274,18 @@ ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: + ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF @@ -273,7 +294,11 @@ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[S_BREV_B32_]] ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -329,13 +354,21 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: [[S_BREV_B64_:%[0-9]+]]:sreg_64 = S_BREV_B64 $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -389,13 +422,21 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.5(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -436,25 +477,34 @@ ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: 
%bb.2(0x40000000), %bb.5(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.6(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.7(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7: + ; GCN-NEXT: S_ENDPGM 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.4 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc - ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.4 @@ -499,9 +549,13 @@ ; GCN-NEXT: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 1, [[COPY]], implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.1(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.1 bb.0: successors: %bb.1 @@ -564,10 +618,14 @@ ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[S_AND_B64_1]], implicit-def $scc + ; GCN-NEXT: successors: %bb.7(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[S_AND_B64_1]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 @@ -629,12 +687,12 @@ ; GCN-NEXT: bb.3: ; GCN-NEXT: successors: %bb.4(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, %2, implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, %2, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) @@ -707,7 +765,7 @@ ; GCN-NEXT: bb.4: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.5: ; GCN-NEXT: S_ENDPGM 0 @@ -756,14 +814,18 @@ ; GCN-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.7(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = 
S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.7(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7: ; GCN-NEXT: successors: %bb.5(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.5 @@ -774,7 +836,7 @@ ; GCN-NEXT: bb.5: ; GCN-NEXT: successors: %bb.6(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.6: ; GCN-NEXT: successors: %bb.4(0x80000000) @@ -832,14 +894,14 @@ ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.16(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF1]], implicit $exec ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], killed [[V_CMP_EQ_U32_e64_1]], implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_1]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec ; GCN-NEXT: S_BRANCH %bb.2 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: @@ -854,25 +916,39 @@ ; GCN-NEXT: S_BRANCH %bb.3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) + ; GCN-NEXT: successors: %bb.4(0x40000000), %bb.15(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CMP_EQ_U32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 0, killed [[DEF3]], implicit $exec ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN-NEXT: [[S_AND_B64_3:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY3]], killed [[V_CMP_EQ_U32_e64_3]], implicit-def dead $scc ; GCN-NEXT: $exec = S_MOV_B64_term killed [[S_AND_B64_3]] - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.7, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.15, implicit $exec ; GCN-NEXT: S_BRANCH %bb.4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.4: + ; GCN-NEXT: successors: %bb.15(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_BRANCH %bb.15 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.15: ; GCN-NEXT: successors: %bb.7(0x80000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_BRANCH %bb.7 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.16: + ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_BRANCH %bb.14 + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.7: + ; GCN-NEXT: successors: %bb.17(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY2]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.17: ; GCN-NEXT: successors: %bb.8(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY2]], implicit-def $scc ; GCN-NEXT: S_BRANCH %bb.8 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.8: @@ -893,9 +969,9 @@ ; GCN-NEXT: S_BRANCH %bb.11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.10: - ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: successors: %bb.18(0x80000000) ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_BRANCH %bb.14 + ; GCN-NEXT: S_BRANCH %bb.18 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.11: ; GCN-NEXT: successors: %bb.12(0x80000000) @@ -903,16 +979,25 @@ ; GCN-NEXT: S_BRANCH %bb.12 ; 
GCN-NEXT: {{ $}} ; GCN-NEXT: bb.12: - ; GCN-NEXT: successors: %bb.10(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: successors: %bb.10(0x40000000), %bb.18(0x40000000) ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[S_OR_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_OR_SAVEEXEC_B64 [[S_XOR_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; GCN-NEXT: [[S_AND_B64_5:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc ; GCN-NEXT: $exec = S_XOR_B64_term $exec, [[S_AND_B64_5]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec + ; GCN-NEXT: S_CBRANCH_EXECZ %bb.18, implicit $exec ; GCN-NEXT: S_BRANCH %bb.10 ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.18: + ; GCN-NEXT: successors: %bb.16(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_BRANCH %bb.16 + ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.14: - ; GCN-NEXT: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: successors: %bb.19(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.19: ; GCN-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.14 Index: llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -48,9 +48,6 @@ ; VMEM: [[ENDIF]]: -; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload - ; Reload and restore exec mask ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -61,6 +58,9 @@ ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; Restore val +; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 { @@ -121,7 +121,6 @@ ; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: [[END]]: -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] @@ -131,6 +130,7 @@ ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 { @@ -230,7 +230,6 @@ ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] @@ -241,6 +240,7 @@ ; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1 ; GCN: 
s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]] +; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 { Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -10,136 +10,136 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, s4 ; GFX900-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz .LBB0_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 ; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX90A-NEXT: buffer_wbl2 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: global_store_dword v[0:1], v1, off +; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_ret_f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_mov_b32_e32 v3, v0 +; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[0:1] glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 ; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: s_cbranch_execnz .LBB0_1 ; GFX11-NEXT: ; 
%bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst @@ -152,52 +152,52 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, s4 ; GFX900-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz .LBB1_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: @@ -215,29 +215,29 @@ ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: 
v_mov_b32_e32 v2, v1 -; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX10-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-NEXT: global_store_dword v[0:1], v1, off +; GFX10-NEXT: global_store_dword v[0:1], v0, off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: @@ -439,52 +439,52 @@ ; GFX900: ; %bb.0: ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GFX900-NEXT: v_mov_b32_e32 v0, s4 ; GFX900-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, v1 -; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX900-NEXT: v_mov_b32_e32 v3, v0 +; GFX900-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX900-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX900-NEXT: s_cbranch_execnz .LBB4_1 ; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v1, off +; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_endpgm ; ; GFX908-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, v1 -; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX908-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX908-NEXT: s_cbranch_execnz .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v1, off +; GFX908-NEXT: global_store_dword v[0:1], v0, off ; 
GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
@@ -502,29 +502,29 @@
; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_agent:
@@ -550,136 +550,136 @@
; GFX900: ; %bb.0:
; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-NEXT: s_mov_b64 s[2:3], 0
-; GFX900-NEXT: v_mov_b32_e32 v0, 0
+; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-NEXT: v_mov_b32_e32 v1, s4
+; GFX900-NEXT: v_mov_b32_e32 v0, s4
; GFX900-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX900-NEXT: v_mov_b32_e32 v2, v1
-; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX900-NEXT: v_mov_b32_e32 v3, v0
+; GFX900-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX900-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_wbinvl1_vol
-; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX900-NEXT: s_cbranch_execnz .LBB5_1
; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX900-NEXT: global_store_dword v[0:1], v1, off
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_endpgm
;
; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX908-NEXT: s_mov_b64 s[2:3], 0
-; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: v_mov_b32_e32 v1, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s4
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_mov_b32_e32 v2, v1
-; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX908-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1_vol
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX908-NEXT: s_cbranch_execnz .LBB5_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX908-NEXT: global_store_dword v[0:1], v1, off
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s4
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: global_store_dword v[0:1], v1, off
+; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_mov_b32_e32 v2, v1
-; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_execnz .LBB5_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: global_store_dword v[0:1], v1, off
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v3
; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_execnz .LBB5_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: global_store_b32 v[0:1], v1, off
+; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") seq_cst
@@ -692,52 +692,52 @@
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_mov_b64 s[2:3], 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: .LBB6_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GCN-NEXT: v_mov_b32_e32 v3, v0
+; GCN-NEXT: v_add_f32_e32 v2, 4.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GCN-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GCN-NEXT: s_cbranch_execnz .LBB6_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
-; GCN-NEXT: global_store_dword v[0:1], v1, off
+; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_mov_b64 s[2:3], 0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_mov_b32_e32 v2, v1
-; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v3
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX11-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_wbinvl1_vol
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
; GFX11-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX11-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX11-NEXT: s_cbranch_execnz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11-NEXT: global_store_dword v[0:1], v1, off
+; GFX11-NEXT: global_store_dword v[0:1], v0, off
; GFX11-NEXT: s_endpgm
%result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
store float %result, ptr addrspace(1) undef
Index: llvm/test/CodeGen/AMDGPU/issue61083.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/issue61083.ll
+++ llvm/test/CodeGen/AMDGPU/issue61083.ll
@@ -32,16 +32,17 @@
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %bb193
; CHECK-NEXT: .LBB0_2: ; %bb194
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: ; %bb.3: ; %bb194
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
-; CHECK-NEXT: s_cbranch_vccnz .LBB0_4
-; CHECK-NEXT: ; %bb.3: ; %bb201
+; CHECK-NEXT: s_cbranch_vccnz .LBB0_5
+; CHECK-NEXT: ; %bb.4: ; %bb201
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4
@@ -53,7 +54,7 @@
; CHECK-NEXT: s_barrier
; CHECK-NEXT: s_trap 2
; CHECK-NEXT: ; divergent unreachable
-; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock
+; CHECK-NEXT: .LBB0_5: ; %UnifiedReturnBlock
; CHECK-NEXT: s_endpgm
bb:
%i10 = tail call i32 @llvm.amdgcn.workitem.id.x()
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -248,8 +248,8 @@
; CHECK-NEXT: v_add_f32_e32 v2, v0, v1
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; CHECK-NEXT: .LBB7_4: ; %END
-; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_and_b64 exec, exec, s[14:15]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_mov_b32_e32 v0, v2
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: ; return to shader part epilog
Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -937,8 +937,8 @@
; SI-NEXT: s_and_b64 exec, exec, s[8:9]
; SI-NEXT: s_branch .LBB7_4
; SI-NEXT: .LBB7_8: ; %.return
-; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_and_b64 exec, exec, s[0:1]
+; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: v_bfrev_b32_e32 v0, 60
; SI-NEXT: v_mov_b32_e32 v1, 0x3c00
; SI-NEXT: exp mrt0 v1, v1, v0, v0 done compr vm
@@ -1003,8 +1003,8 @@
; GFX9-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_branch .LBB7_4
; GFX9-NEXT: .LBB7_8: ; %.return
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX9-NEXT: v_bfrev_b32_e32 v1, 60
; GFX9-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
@@ -1066,8 +1066,8 @@
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
; GFX10-32-NEXT: s_branch .LBB7_4
; GFX10-32-NEXT: .LBB7_8: ; %.return
-; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-32-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
@@ -1129,8 +1129,8 @@
; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9]
; GFX10-64-NEXT: s_branch .LBB7_4
; GFX10-64-NEXT: .LBB7_8: ; %.return
-; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
; GFX10-64-NEXT: exp mrt0 v0, v0, v1, v1 done compr vm
Index: llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -1184,14 +1184,15 @@
; W64-O0-NEXT: s_waitcnt vmcnt(0)
; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
; W64-O0-NEXT: .LBB2_8: ; %bb2
+; W64-O0-NEXT: v_readlane_b32 s4, v8, 10
+; W64-O0-NEXT: v_readlane_b32 s5, v8, 11
+; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; W64-O0-NEXT: ; %bb.9: ; %bb2
; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; W64-O0-NEXT: s_nop 0
; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; W64-O0-NEXT: v_readlane_b32 s4, v8, 10
-; W64-O0-NEXT: v_readlane_b32 s5, v8, 11
-; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5]
; W64-O0-NEXT: s_waitcnt vmcnt(0)
; W64-O0-NEXT: global_store_dword v[0:1], v2, off
; W64-O0-NEXT: s_waitcnt vmcnt(0)
Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -380,26 +380,26 @@
; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2
; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v11
; GCN-IR-NEXT: v_min_u32_e32 v13, v2, v3
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v12, v13
+; GCN-IR-NEXT: v_sub_i32_e64 v8, s[6:7], v12, v13
; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7]
-; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3]
+; GCN-IR-NEXT: v_subb_u32_e64 v9, s[6:7], 0, 0, s[6:7]
+; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[8:9]
; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3]
+; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9]
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1
; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
; GCN-IR-NEXT: v_mov_b32_e32 v7, v5
-; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[4:5]
-; GCN-IR-NEXT: v_cndmask_b32_e64 v8, v10, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v11, 0, s[4:5]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v10, 0, s[4:5]
; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc
; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GCN-IR-NEXT: s_cbranch_execz .LBB1_6
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v2
-; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
+; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v8
+; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v9, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v8
; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[10:11], v2
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
@@ -448,14 +448,14 @@
; GCN-IR-NEXT: .LBB1_5: ; %Flow3
; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
-; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v1
-; GCN-IR-NEXT: v_or_b32_e32 v8, v8, v0
+; GCN-IR-NEXT: v_or_b32_e32 v2, v9, v1
+; GCN-IR-NEXT: v_or_b32_e32 v3, v8, v0
; GCN-IR-NEXT: .LBB1_6: ; %Flow4
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4
; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v6
-; GCN-IR-NEXT: v_xor_b32_e32 v3, v8, v0
-; GCN-IR-NEXT: v_xor_b32_e32 v2, v9, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v0
+; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v1
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GCN-IR-NEXT: s_setpc_b64 s[30:31]
Index: llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
+++ llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -87,20 +87,12 @@
; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4
; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0
; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5
-; GLOBALNESS1-NEXT: s_branch .LBB1_4
-; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4
-; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5
-; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29
-; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0
-; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow19
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: s_branch .LBB1_3
+; GLOBALNESS1-NEXT: .LBB1_1: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
+; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow19
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a63, v31
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a62, v30
@@ -135,7 +127,7 @@
; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a33, v1
; GLOBALNESS1-NEXT: v_accvgpr_write_b32 a32, v0
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_30
-; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5
+; GLOBALNESS1-NEXT: .LBB1_3: ; %bb5
; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1]
@@ -157,27 +149,31 @@
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_8
-; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: ; %bb.4: ; %NodeBlock
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: s_cmp_lt_i32 s39, 1
-; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7
-; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock3
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_6
+; GLOBALNESS1-NEXT: ; %bb.5: ; %LeafBlock3
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 1
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1
; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT: s_cbranch_execnz .LBB1_8
-; GLOBALNESS1-NEXT: s_branch .LBB1_23
-; GLOBALNESS1-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_7
+; GLOBALNESS1-NEXT: s_branch .LBB1_8
+; GLOBALNESS1-NEXT: .LBB1_6: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0
; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS1-NEXT: s_branch .LBB1_23
+; GLOBALNESS1-NEXT: .LBB1_7: ; %LeafBlock
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 0
+; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0
; GLOBALNESS1-NEXT: .LBB1_8: ; %Flow16
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_24
-; GLOBALNESS1-NEXT: .LBB1_9: ; %baz.exit.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_1
+; GLOBALNESS1-NEXT: ; %bb.9: ; %baz.exit.i
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0
; GLOBALNESS1-NEXT: flat_load_dword v0, v[32:33]
; GLOBALNESS1-NEXT: s_mov_b32 s68, s93
@@ -216,21 +212,21 @@
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[70:71], s[96:97]
-; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_26
+; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_24
; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off
; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0
; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12
; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off
; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc
; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43]
@@ -245,9 +241,9 @@
; GLOBALNESS1-NEXT: .LBB1_14: ; %bb63.i
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[86:87]
-; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_25
+; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_23
; GLOBALNESS1-NEXT: .LBB1_15: ; %bb44.i
-; GLOBALNESS1-NEXT: ; Parent Loop BB1_4 Depth=1
+; GLOBALNESS1-NEXT: ; Parent Loop BB1_3 Depth=1
; GLOBALNESS1-NEXT: ; => This Inner Loop Header: Depth=2
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[94:95]
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14
@@ -301,19 +297,8 @@
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off
; GLOBALNESS1-NEXT: s_branch .LBB1_13
-; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_cmp_lg_u32 s39, 0
-; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0
-; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9
-; GLOBALNESS1-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1
-; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GLOBALNESS1-NEXT: s_branch .LBB1_3
-; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: .LBB1_23: ; %Flow14
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37]
; GLOBALNESS1-NEXT: s_mov_b32 s36, s93
; GLOBALNESS1-NEXT: s_mov_b32 s37, s93
@@ -354,28 +339,38 @@
; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5]
; GLOBALNESS1-NEXT: s_mov_b32 s39, s75
; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[72:73]
-; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: .LBB1_24: ; %Flow15
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[70:71]
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
-; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2
-; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_29
+; GLOBALNESS1-NEXT: ; %bb.25: ; %bb67.i
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2
; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1
-; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_27
+; GLOBALNESS1-NEXT: ; %bb.26: ; %bb69.i
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0
; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
-; GLOBALNESS1-NEXT: s_branch .LBB1_1
-; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i
-; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT: .LBB1_27: ; %bb70.i
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4
+; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5
+; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29
+; GLOBALNESS1-NEXT: ; %bb.28: ; %bb73.i
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0
; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
+; GLOBALNESS1-NEXT: .LBB1_29: ; %Flow6
+; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0
+; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS1-NEXT: s_branch .LBB1_2
; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5]
@@ -472,20 +467,12 @@
; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4
; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0
; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5
-; GLOBALNESS0-NEXT: s_branch .LBB1_4
-; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4
-; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5
-; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29
-; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5]
-; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0
-; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT: .LBB1_3: ; %Flow19
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: s_branch .LBB1_3
+; GLOBALNESS0-NEXT: .LBB1_1: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
+; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow19
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a63, v31
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a62, v30
@@ -520,7 +507,7 @@
; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a33, v1
; GLOBALNESS0-NEXT: v_accvgpr_write_b32 a32, v0
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_30
-; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5
+; GLOBALNESS0-NEXT: .LBB1_3: ; %bb5
; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1
; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1]
@@ -542,27 +529,31 @@
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_8
-; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: ; %bb.4: ; %NodeBlock
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: s_cmp_lt_i32 s39, 1
-; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7
-; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock3
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_6
+; GLOBALNESS0-NEXT: ; %bb.5: ; %LeafBlock3
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 1
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1
; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT: s_cbranch_execnz .LBB1_8
-; GLOBALNESS0-NEXT: s_branch .LBB1_23
-; GLOBALNESS0-NEXT: .LBB1_7: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_7
+; GLOBALNESS0-NEXT: s_branch .LBB1_8
+; GLOBALNESS0-NEXT: .LBB1_6: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0
; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GLOBALNESS0-NEXT: s_branch .LBB1_23
+; GLOBALNESS0-NEXT: .LBB1_7: ; %LeafBlock
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 0
+; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0
; GLOBALNESS0-NEXT: .LBB1_8: ; %Flow16
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_24
-; GLOBALNESS0-NEXT: .LBB1_9: ; %baz.exit.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_1
+; GLOBALNESS0-NEXT: ; %bb.9: ; %baz.exit.i
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0
; GLOBALNESS0-NEXT: flat_load_dword v0, v[32:33]
; GLOBALNESS0-NEXT: s_mov_b32 s68, s93
@@ -601,21 +592,21 @@
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[70:71], s[96:97]
-; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_26
+; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_24
; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off
; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0
; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12
; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0
; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off
; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc
; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43]
@@ -630,9 +621,9 @@
; GLOBALNESS0-NEXT: .LBB1_14: ; %bb63.i
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[86:87]
-; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_25
+; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_23
; GLOBALNESS0-NEXT: .LBB1_15: ; %bb44.i
-; GLOBALNESS0-NEXT: ; Parent Loop BB1_4 Depth=1
+; GLOBALNESS0-NEXT: ; Parent Loop BB1_3 Depth=1
; GLOBALNESS0-NEXT: ; => This Inner Loop Header: Depth=2
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[94:95]
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14
@@ -686,19 +677,8 @@
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off
; GLOBALNESS0-NEXT: s_branch .LBB1_13
-; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_cmp_lg_u32 s39, 0
-; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0
-; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9
-; GLOBALNESS0-NEXT: .LBB1_24: ; in Loop: Header=BB1_4 Depth=1
-; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1
-; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
-; GLOBALNESS0-NEXT: s_branch .LBB1_3
-; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: .LBB1_23: ; %Flow14
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37]
; GLOBALNESS0-NEXT: s_mov_b32 s36, s93
; GLOBALNESS0-NEXT: s_mov_b32 s37, s93
@@ -739,28 +719,38 @@
; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5]
; GLOBALNESS0-NEXT: s_mov_b32 s39, s75
; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[72:73]
-; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow15
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[70:71]
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[96:97]
-; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2
-; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_29
+; GLOBALNESS0-NEXT: ; %bb.25: ; %bb67.i
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2
; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7]
-; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1
-; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_27
+; GLOBALNESS0-NEXT: ; %bb.26: ; %bb69.i
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0
; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
-; GLOBALNESS0-NEXT: s_branch .LBB1_1
-; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i
-; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT: .LBB1_27: ; %bb70.i
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4
+; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5
+; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29
+; GLOBALNESS0-NEXT: ; %bb.28: ; %bb73.i
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40
; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0
; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off
+; GLOBALNESS0-NEXT: .LBB1_29: ; %Flow6
+; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_3 Depth=1
+; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0
+; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5
; GLOBALNESS0-NEXT: s_branch .LBB1_2
; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5]
Index: llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
+++ llvm/test/CodeGen/AMDGPU/uniform-phi-with-undef.ll
@@ -32,6 +32,7 @@
; GCN-NEXT: v_div_fixup_f32 v0, v1, s2, v0
; GCN-NEXT: .LBB0_2: ; %end
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GCN-NEXT: ; %bb.3: ; %end
; GCN-NEXT: v_add_f32_e64 v0, v0, s0
; GCN-NEXT: ; return to shader part epilog
entry:
Index: llvm/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1239,8 +1239,8 @@
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: .LBB23_4: ; %END
-; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -1266,8 +1266,8 @@
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: .LBB23_4: ; %END
-; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1398,8 +1398,8 @@
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX9-W64-NEXT: ; %bb.4: ; %END
-; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
@@ -1428,8 +1428,8 @@
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX10-W32-NEXT: ; %bb.4: ; %END
-; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
@@ -2220,8 +2220,8 @@
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT: .LBB36_2: ; %ENDIF
-; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
@@ -2250,8 +2250,8 @@
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT: .LBB36_2: ; %ENDIF
-; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
@@ -2757,8 +2757,8 @@
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT: .LBB45_2: ; %ENDIF
-; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
@@ -2787,8 +2787,8 @@
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT: .LBB45_2: ; %ENDIF
-; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
@@ -2835,8 +2835,8 @@
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-W64-NEXT: .LBB46_2: ; %ENDIF
-; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
+; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
@@ -2859,8 +2859,8 @@
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-W32-NEXT: .LBB46_2: ; %ENDIF
-; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
+; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%cmp = icmp eq i32 %z, 0
Index: llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -196,12 +196,12 @@
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: .LBB1_2: ; %merge
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4
; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35]
+; GFX9-O0-NEXT: ; %bb.3: ; %merge
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0
; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1
; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2