Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2235,9 +2235,11 @@
   // Skip over the instructions that are artificially terminators for special
   // exec management.
   while (I != E && !I->isBranch() && !I->isReturn() &&
-         I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
+         (I->getOpcode() != AMDGPU::SI_MASK_BRANCH &&
+          I->getOpcode() != AMDGPU::SI_KILL_CLEANUP)) {
     switch (I->getOpcode()) {
     case AMDGPU::SI_MASK_BRANCH:
+    case AMDGPU::SI_KILL_CLEANUP:
     case AMDGPU::S_MOV_B64_term:
     case AMDGPU::S_XOR_B64_term:
     case AMDGPU::S_OR_B64_term:
@@ -2301,7 +2303,8 @@
   unsigned RemovedSize = 0;
   while (I != MBB.end()) {
     MachineBasicBlock::iterator Next = std::next(I);
-    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+    if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH ||
+        I->getOpcode() == AMDGPU::SI_KILL_CLEANUP) {
       I = Next;
       continue;
     }
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -381,7 +381,9 @@
 defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
 
 let Defs = [EXEC] in
-def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)> {
+  let isTerminator = 1;
+}
 
 let Defs = [EXEC,VCC] in
 def SI_ILLEGAL_COPY : SPseudoInstSI <
Index: llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -533,31 +533,12 @@
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
 
-  MachineBasicBlock::iterator InsPt = MBB.begin();
-
-  // If we have instructions that aren't prolog instructions, split the block
-  // and emit a terminator instruction. This ensures correct spill placement.
-  // FIXME: We should unconditionally split the block here.
-  bool NeedBlockSplit = false;
-  Register DataReg = MI.getOperand(0).getReg();
-  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
-       I != E; ++I) {
-    if (I->modifiesRegister(DataReg, TRI)) {
-      NeedBlockSplit = true;
-      break;
-    }
-  }
-
-  unsigned Opcode = OrOpc;
-  MachineBasicBlock *SplitBB = &MBB;
-  if (NeedBlockSplit) {
-    SplitBB = splitBlock(MI, &MBB, LIS);
-    Opcode = OrTermrOpc;
-    InsPt = MI;
-  }
+  // Split the block so we can turn this into a terminator.
+  MachineBasicBlock::iterator InsPt = MI.getIterator();
+  MachineBasicBlock *SplitBB = splitBlock(MI, &MBB, LIS);
 
   MachineInstr *NewMI =
-      BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
+      BuildMI(MBB, InsPt, DL, TII->get(OrTermrOpc), Exec)
           .addReg(Exec)
           .add(MI.getOperand(0));
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -1073,12 +1073,12 @@
 ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
 ; GFX10_W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
 ; GFX10_W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10_W64-NEXT: s_mov_b32 s6, 0
+; GFX10_W64-NEXT: s_mov_b32 s4, 0
 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT: global_load_dwordx3 v[1:3], v1, s[2:3]
 ; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX10_W64-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX10_W64-NEXT: s_and_saveexec_b64 s[6:7], vcc
 ; GFX10_W64-NEXT: s_cbranch_execz BB13_2
 ; GFX10_W64-NEXT: ; %bb.1: ; %bb
 ; GFX10_W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
@@ -1086,10 +1086,10 @@
 ; GFX10_W64-NEXT: s_load_dword s0, s[0:1], 0x0
 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0
-; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0
+; GFX10_W64-NEXT: s_cselect_b32 s4, 1, 0
 ; GFX10_W64-NEXT: BB13_2: ; %exit
-; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX10_W64-NEXT: s_and_b32 s0, 1, s6
+; GFX10_W64-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX10_W64-NEXT: s_and_b32 s0, 1, s4
 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT: s_add_u32 s0, s2, 8
Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -36,8 +36,8 @@
 ; GFX7-NEXT: v_readfirstlane_b32 s4, v1
 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4
 ; GFX7-NEXT: BB0_4: ; %Flow
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT: s_wqm_b64 s[4:5], -1
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX7-NEXT: s_cbranch_vccnz BB0_6
 ; GFX7-NEXT: ; %bb.5: ; %if
@@ -69,8 +69,8 @@
 ; GFX8-NEXT: v_readfirstlane_b32 s4, v1
 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
 ; GFX8-NEXT: BB0_4: ; %Flow
-; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT: s_wqm_b64 s[4:5], -1
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_vccnz BB0_6
 ; GFX8-NEXT: ; %bb.5: ; %if
@@ -102,8 +102,8 @@
 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1
 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
 ; GFX9-NEXT: BB0_4: ; %Flow
-; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT: s_wqm_b64 s[4:5], -1
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX9-NEXT: s_cbranch_vccnz BB0_6
 ; GFX9-NEXT: ; %bb.5: ; %if
@@ -136,8 +136,8 @@
 ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4
 ; GFX1064-NEXT: BB0_4: ; %Flow
-; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX1064-NEXT: s_cbranch_vccnz BB0_6
 ; GFX1064-NEXT: ; %bb.5: ; %if
@@ -170,8 +170,8 @@
 ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s4
 ; GFX1032-NEXT: BB0_4: ; %Flow
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT: s_wqm_b32 s4, -1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
 ; GFX1032-NEXT: s_cbranch_vccnz BB0_6
 ; GFX1032-NEXT: ; %bb.5: ; %if
@@ -253,8 +253,8 @@
 ; GFX8-NEXT: v_mov_b32_e32 v0, v1
 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v0
 ; GFX8-NEXT: BB1_4: ; %Flow
-; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT: s_wqm_b64 s[4:5], -1
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX8-NEXT: s_cbranch_vccnz BB1_6
 ; GFX8-NEXT: ; %bb.5: ; %if
@@ -310,8 +310,8 @@
 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
 ; GFX9-NEXT: v_add_u32_e32 v3, s4, v0
 ; GFX9-NEXT: BB1_4: ; %Flow
-; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT: s_wqm_b64 s[4:5], -1
+; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX9-NEXT: s_cbranch_vccnz BB1_6
 ; GFX9-NEXT: ; %bb.5: ; %if
@@ -371,8 +371,8 @@
 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1
 ; GFX1064-NEXT: v_add_nc_u32_e32 v4, s4, v0
 ; GFX1064-NEXT: BB1_4: ; %Flow
-; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5]
 ; GFX1064-NEXT: s_cbranch_vccnz BB1_6
 ; GFX1064-NEXT: ; %bb.5: ; %if
@@ -425,8 +425,8 @@
 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1
 ; GFX1032-NEXT: v_add_nc_u32_e32 v4, s4, v0
 ; GFX1032-NEXT: BB1_4: ; %Flow
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT: s_wqm_b32 s4, -1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8
 ; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
 ; GFX1032-NEXT: s_cbranch_vccnz BB1_6
 ; GFX1032-NEXT: ; %bb.5: ; %if
Index: llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -16,16 +16,21 @@
  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
  ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
  ; GCN: bb.1:
- ; GCN: successors: %bb.2(0x40000000), %bb.4(0x40000000)
+ ; GCN: successors: %bb.2(0x40000000), %bb.5(0x40000000)
  ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
  ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc
  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
- ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
+ ; GCN: S_CBRANCH_EXECZ %bb.5, implicit $exec
  ; GCN: bb.2:
+ ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: bb.5:
  ; GCN: successors: %bb.4(0x80000000)
+ ; GCN: DBG_VALUE
  ; GCN: bb.4:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.6(0x80000000)
  ; GCN: DBG_VALUE
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.6:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -75,7 +80,9 @@
  ; GCN: bb.4:
  ; GCN: successors: %bb.5(0x80000000)
  ; GCN: bb.5:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.6:
  ; GCN: S_ENDPGM 0
  bb.0:
    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
@@ -124,7 +131,9 @@
  ; GCN: successors: %bb.5(0x80000000)
  ; GCN: DBG_VALUE
  ; GCN: bb.5:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.6:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -178,7 +187,9 @@
  ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
  ; GCN: KILL [[DEF]]
  ; GCN: bb.4:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.5:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -221,21 +232,23 @@
  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
  ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
  ; GCN: bb.1:
- ; GCN: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; GCN: successors: %bb.2(0x40000000), %bb.5(0x40000000)
  ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
  ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc
  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
- ; GCN: S_CBRANCH_EXECZ %bb.3, implicit $exec
+ ; GCN: S_CBRANCH_EXECZ %bb.5, implicit $exec
  ; GCN: bb.2:
- ; GCN: successors: %bb.3(0x80000000)
- ; GCN: bb.3:
+ ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: bb.5:
  ; GCN: successors: %bb.4(0x80000000)
  ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
  ; GCN: [[S_BREV_B32_:%[0-9]+]]:sgpr_32 = S_BREV_B32 [[DEF]]
  ; GCN: KILL [[DEF]]
  ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[S_BREV_B32_]]
  ; GCN: bb.4:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.6:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -285,11 +298,15 @@
  ; GCN: bb.2:
  ; GCN: successors: %bb.3(0x80000000)
  ; GCN: bb.3:
+ ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc
+ ; GCN: bb.5:
  ; GCN: successors: %bb.4(0x80000000)
- ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
  ; GCN: [[S_BREV_B64_:%[0-9]+]]:sreg_64 = S_BREV_B64 $exec
  ; GCN: bb.4:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.6:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -337,11 +354,15 @@
  ; GCN: bb.2:
  ; GCN: successors: %bb.3(0x80000000)
  ; GCN: bb.3:
+ ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc
+ ; GCN: bb.5:
  ; GCN: successors: %bb.4(0x80000000)
- ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
  ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub2
  ; GCN: bb.4:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.6:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -380,19 +401,24 @@
  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_]]
  ; GCN: S_CBRANCH_EXECZ %bb.4, implicit $exec
  ; GCN: bb.1:
- ; GCN: successors: %bb.2(0x40000000), %bb.5(0x40000000)
+ ; GCN: successors: %bb.2(0x40000000), %bb.6(0x40000000)
  ; GCN: [[COPY1:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
  ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY1]], undef %3:sreg_64, implicit-def dead $scc
  ; GCN: $exec = S_MOV_B64_term killed [[S_AND_B64_1]]
- ; GCN: S_CBRANCH_EXECZ %bb.5, implicit $exec
+ ; GCN: S_CBRANCH_EXECZ %bb.6, implicit $exec
  ; GCN: bb.2:
+ ; GCN: successors: %bb.6(0x80000000)
+ ; GCN: bb.6:
  ; GCN: successors: %bb.5(0x80000000)
+ ; GCN: S_BRANCH %bb.5
+ ; GCN: bb.4:
+ ; GCN: successors: %bb.7(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
+ ; GCN: bb.7:
+ ; GCN: S_ENDPGM 0
  ; GCN: bb.5:
  ; GCN: successors: %bb.4(0x80000000)
  ; GCN: S_BRANCH %bb.4
- ; GCN: bb.4:
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
- ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.4
@@ -435,8 +461,10 @@
  ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
  ; GCN: [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 1, [[COPY]], implicit $exec
  ; GCN: bb.1:
+ ; GCN: successors: %bb.2(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc
+ ; GCN: bb.2:
  ; GCN: successors: %bb.1(0x80000000)
- ; GCN: $exec = S_OR_B64 $exec, [[V_CMP_LT_U32_e64_]], implicit-def $scc
  ; GCN: S_BRANCH %bb.1
  bb.0:
    successors: %bb.1
@@ -487,9 +515,11 @@
  ; GCN: successors: %bb.5(0x80000000)
  ; GCN: bb.5:
  ; GCN: successors: %bb.6(0x80000000)
- ; GCN: $exec = S_OR_B64 $exec, [[COPY1]], implicit-def $scc
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY1]], implicit-def $scc
  ; GCN: bb.6:
- ; GCN: $exec = S_OR_B64 $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+ ; GCN: successors: %bb.7(0x80000000)
+ ; GCN: $exec = S_OR_B64_term $exec, [[S_OR_SAVEEXEC_B64_]], implicit-def $scc
+ ; GCN: bb.7:
  ; GCN: S_ENDPGM 0
  bb.0:
    successors: %bb.1, %bb.2
@@ -544,10 +574,10 @@
  ; GCN: S_BRANCH %bb.6
  ; GCN: bb.3:
  ; GCN: successors: %bb.4(0x80000000)
- ; GCN: $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
+ ; GCN: $exec = S_OR_B64_term $exec, [[COPY]], implicit-def $scc
  ; GCN: bb.4:
  ; GCN: successors: %bb.5(0x80000000)
- ; GCN: $exec = S_OR_B64 $exec, %2, implicit-def $scc
+ ; GCN: $exec = S_OR_B64_term $exec, %2, implicit-def $scc
  ; GCN: bb.5:
  ; GCN: successors: %bb.6(0x80000000)
  ; GCN: bb.6:
Index: llvm/test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -51,24 +51,24 @@
 ; GCN-NEXT: s_branch BB0_2
 ; GCN-NEXT: BB0_1: ; %loop.exit.guard
 ; GCN-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_and_b64 s[2:3], exec, s[2:3]
-; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
 ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GCN-NEXT: s_cbranch_execz BB0_6
 ; GCN-NEXT: BB0_2: ; %LOOP.outer
 ; GCN-NEXT: ; =>This Loop Header: Depth=1
 ; GCN-NEXT: ; Child Loop BB0_4 Depth 2
 ; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
-; GCN-NEXT: ; implicit-def: $sgpr2_sgpr3
-; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT: s_mov_b64 s[2:3], 0
 ; GCN-NEXT: s_branch BB0_4
 ; GCN-NEXT: BB0_3: ; %Flow
 ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2
 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7]
-; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
-; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
 ; GCN-NEXT: s_cbranch_execz BB0_1
 ; GCN-NEXT: BB0_4: ; %LOOP
 ; GCN-NEXT: ; Parent Loop BB0_2 Depth=1
@@ -76,14 +76,14 @@
 ; GCN-NEXT: v_mov_b32_e32 v1, v0
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v1
 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, v1, v4
-; GCN-NEXT: s_or_b64 s[2:3], s[2:3], exec
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
 ; GCN-NEXT: s_cbranch_execz BB0_3
 ; GCN-NEXT: ; %bb.5: ; %ENDIF
 ; GCN-NEXT: ; in Loop: Header=BB0_4 Depth=2
 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, v5, v0
-; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
 ; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
 ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
@@ -179,45 +179,45 @@
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_dword s2, s[0:1], 0x9
 ; GCN-NEXT: s_mov_b64 s[0:1], 0
-; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s7, 0xf000
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: ; implicit-def: $sgpr2_sgpr3
 ; GCN-NEXT: s_branch BB1_2
 ; GCN-NEXT: BB1_1: ; %Flow4
 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: s_and_b64 s[6:7], exec, s[6:7]
-; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
-; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
-; GCN-NEXT: s_and_b64 s[6:7], s[8:9], exec
-; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT: s_and_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
+; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
+; GCN-NEXT: s_and_b64 s[4:5], s[8:9], exec
+; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
 ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1]
 ; GCN-NEXT: s_cbranch_execz BB1_9
 ; GCN-NEXT: BB1_2: ; %bb1
 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
-; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: s_mov_b64 s[4:5], -1
 ; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9
 ; GCN-NEXT: s_mov_b64 s[10:11], -1
 ; GCN-NEXT: s_cbranch_vccnz BB1_6
 ; GCN-NEXT: ; %bb.3: ; %LeafBlock1
 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: s_mov_b64 s[6:7], -1
+; GCN-NEXT: s_mov_b64 s[4:5], -1
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
 ; GCN-NEXT: s_and_b64 vcc, exec, vcc
 ; GCN-NEXT: s_mov_b64 s[8:9], -1
 ; GCN-NEXT: s_cbranch_vccz BB1_5
 ; GCN-NEXT: ; %bb.4: ; %case1
 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], 0
+; GCN-NEXT: buffer_load_dword v2, off, s[4:7], 0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v2
 ; GCN-NEXT: s_mov_b64 s[8:9], 0
-; GCN-NEXT: s_orn2_b64 s[6:7], vcc, exec
+; GCN-NEXT: s_orn2_b64 s[4:5], vcc, exec
 ; GCN-NEXT: BB1_5: ; %Flow3
 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
 ; GCN-NEXT: s_mov_b64 s[10:11], 0
@@ -233,17 +233,17 @@
 ; GCN-NEXT: s_cbranch_vccz BB1_1
 ; GCN-NEXT: ; %bb.8: ; %case0
 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; GCN-NEXT: s_mov_b64 s[8:9], 0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
-; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
 ; GCN-NEXT: s_and_b64 s[10:11], vcc, exec
-; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11]
+; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11]
 ; GCN-NEXT: s_branch BB1_1
 ; GCN-NEXT: BB1_9: ; %loop.exit.guard
 ; GCN-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[4:5]
+; GCN-NEXT: s_and_saveexec_b64 s[0:1], s[2:3]
 ; GCN-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
 ; GCN-NEXT: s_endpgm
 bb:
Index: llvm/test/CodeGen/AMDGPU/sdiv64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -411,32 +411,32 @@
 ; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v10
 ; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
 ; GCN-IR-NEXT: v_cndmask_b32_e32 v14, v7, v0, vcc
-; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v13, v14
-; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
-; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8]
-; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8]
+; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v13, v14
+; GCN-IR-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc
+; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[11:12]
+; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[11:12]
 ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc
 ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1
 ; GCN-IR-NEXT: v_mov_b32_e32 v18, 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4
 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5
-; GCN-IR-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[6:7]
+; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v10, 0, s[6:7]
 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5]
 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v18
 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v9, 0, s[6:7]
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v7
-; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v8, vcc
-; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v7
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v11
+; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v12, vcc
+; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 63, v11
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[11:12]
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[9:10], v0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, -1, v2
@@ -471,23 +471,23 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v14, v19
 ; GCN-IR-NEXT: v_mov_b32_e32 v19, v12
 ; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v20, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v18, v11
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1
-; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v3
+; GCN-IR-NEXT: v_or_b32_e32 v7, v12, v3
 ; GCN-IR-NEXT: v_or_b32_e32 v0, v11, v2
 ; GCN-IR-NEXT: BB1_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_xor_b32_e32 v2, v5, v4
 ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v2
 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6
-; GCN-IR-NEXT: v_xor_b32_e32 v3, v12, v1
+; GCN-IR-NEXT: v_xor_b32_e32 v3, v7, v1
 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-IR-NEXT: s_setpc_b64 s[30:31]
@@ -1505,10 +1505,10 @@
 ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB11_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_lshr_b64 v[12:13], 24, v8
@@ -1541,14 +1541,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v15
 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB11_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB11_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1
 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0
@@ -1719,12 +1719,12 @@
 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
 ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
-; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4
 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB12_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
@@ -1759,14 +1759,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v15
 ; GCN-IR-NEXT: v_mov_b32_e32 v15, v7
 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v14, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB12_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB12_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1
 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0
@@ -1825,10 +1825,10 @@
 ; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v3
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[7:8], v3
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB13_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[7:8], v9
@@ -1860,14 +1860,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9
 ; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB13_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB13_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB13_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[3:4], v[3:4], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
Index: llvm/test/CodeGen/AMDGPU/srem64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/srem64.ll
+++ llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -402,61 +402,61 @@
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v7
-; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v8, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, 1, v7
+; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, 0, v8, vcc
 ; GCN-IR-NEXT: v_sub_i32_e64 v3, s[4:5], 63, v7
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[9:10], v[7:8]
-; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[16:17], v[7:8]
+; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v3
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, -1, v5
-; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[0:1], v9
-; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v6, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v10, v12
-; GCN-IR-NEXT: v_not_b32_e32 v11, v13
-; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, v10, v14
-; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, v11, v15, vcc
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, -1, v6, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v9, v12
+; GCN-IR-NEXT: v_not_b32_e32 v10, v13
+; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, v9, v14
+; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, v10, v15, vcc
 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0
+; GCN-IR-NEXT: v_lshr_b64 v[16:17], v[0:1], v16
 ; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v10, 31, v8
-; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v10
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v9, 31, v8
+; GCN-IR-NEXT: v_or_b32_e32 v16, v16, v9
 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v10, vcc, v3, v16
-; GCN-IR-NEXT: v_subb_u32_e32 v10, vcc, v9, v17, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v3, v16
+; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v11, v17, vcc
 ; GCN-IR-NEXT: v_or_b32_e32 v7, v14, v7
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v10
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v9
 ; GCN-IR-NEXT: v_and_b32_e32 v19, v14, v5
-; GCN-IR-NEXT: v_and_b32_e32 v10, 1, v14
+; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v14
 ; GCN-IR-NEXT: v_and_b32_e32 v18, v14, v6
 ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v12
 ; GCN-IR-NEXT: v_or_b32_e32 v8, v15, v8
 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v13, vcc
 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[12:13]
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v14
-; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v16, s[4:5], v16, v19
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v15
-; GCN-IR-NEXT: v_mov_b32_e32 v15, v11
+; GCN-IR-NEXT: v_mov_b32_e32 v15, v10
 ; GCN-IR-NEXT: v_subb_u32_e64 v17, s[4:5], v17, v18, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v14, v10
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v14, v9
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1
-; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v8
-; GCN-IR-NEXT: v_or_b32_e32 v9, v10, v7
+; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v8
+; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7
 ; GCN-IR-NEXT: BB1_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v5, v3
@@ -1684,10 +1684,10 @@
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[3:4]
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB11_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v6
@@ -1720,14 +1720,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB11_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB11_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB11_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2
@@ -1896,12 +1896,12 @@
 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB12_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
@@ -1936,14 +1936,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB12_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB12_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -2008,10 +2008,10 @@
 ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4
 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB13_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
@@ -2043,14 +2043,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v15, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v14
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v15, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB13_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB13_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB13_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
 ; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v4
Index: llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -118,12 +118,14 @@
  ; GCN: successors: %bb.7(0x40000000), %bb.8(0x40000000)
  ; GCN: liveins: $sgpr0_sgpr1
  ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
- ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec
- ; GCN: S_BRANCH %bb.8
- ; GCN: bb.7:
+ ; GCN: S_CBRANCH_EXECZ %bb.8, implicit $exec
+ ; GCN: bb.7.end:
+ ; GCN: successors: %bb.9(0x80000000)
+ ; GCN: S_BRANCH %bb.9
+ ; GCN: bb.8:
  ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
  ; GCN: S_ENDPGM 0
- ; GCN: bb.8:
+ ; GCN: bb.9:
 entry:
   %.i0 = fdiv reassoc nnan nsz arcp contract afn float 1.000000e+00, %val
   %cmp0 = fcmp olt float %.i0, 0.000000e+00
Index: llvm/test/CodeGen/AMDGPU/udiv64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -374,10 +374,10 @@
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7]
 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12
@@ -412,14 +412,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v11
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v7
 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v6, v0
@@ -1271,12 +1271,12 @@
 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[4:5]
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB9_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
@@ -1311,14 +1311,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB9_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB9_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB9_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0
@@ -1366,10 +1366,10 @@
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB10_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
@@ -1401,14 +1401,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7
 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v11, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB10_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB10_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB10_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
@@ -1743,10 +1743,10 @@
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[4:5]
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB12_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7
@@ -1776,14 +1776,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v10
 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v5
 ; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB12_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB12_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB12_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1
 ; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0
Index: llvm/test/CodeGen/AMDGPU/urem64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/urem64.ll
+++ llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -377,61 +377,61 @@
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_6
 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
-; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v6, vcc
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v5
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc
 ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5
-; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[7:8], v[5:6]
-; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
+; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[8:9], v[5:6]
+; GCN-IR-NEXT: v_mov_b32_e32 v6, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB1_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
-; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, -1, v2
-; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v7
-; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc
-; GCN-IR-NEXT: v_not_b32_e32 v8, v10
-; GCN-IR-NEXT: v_not_b32_e32 v9, v11
-; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v8, v12
-; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, v9, v13, vcc
+; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v8
+; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, -1, v2
+; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc
+; GCN-IR-NEXT: v_not_b32_e32 v6, v10
+; GCN-IR-NEXT: v_not_b32_e32 v7, v11
+; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v6, v12
+; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, v7, v13, vcc
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while
 ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
-; GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v5
-; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v8
+; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5
+; GCN-IR-NEXT: v_or_b32_e32 v14, v14, v6
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v6, v14
-; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v7, v15, vcc
+; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v14
+; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v9, v15, vcc
 ; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4
-; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v8
+; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6
 ; GCN-IR-NEXT: v_and_b32_e32 v17, v12, v2
-; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v12
+; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12
 ; GCN-IR-NEXT: v_and_b32_e32 v16, v12, v3
 ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v10
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5
 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v11, vcc
 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v12
-; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
+; GCN-IR-NEXT: v_mov_b32_e32 v7, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v14, v17
 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v13
-; GCN-IR-NEXT: v_mov_b32_e32 v13, v9
+; GCN-IR-NEXT: v_mov_b32_e32 v13, v7
 ; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GCN-IR-NEXT: v_mov_b32_e32 v12, v8
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GCN-IR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB1_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB1_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB1_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
-; GCN-IR-NEXT: v_or_b32_e32 v7, v9, v5
-; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4
+; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v5
+; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4
 ; GCN-IR-NEXT: BB1_6: ; %Flow4
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7]
 ; GCN-IR-NEXT: v_mul_lo_u32 v5, v2, v7
@@ -1290,12 +1290,12 @@
 ; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
 ; GCN-IR-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[2:3]
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
-; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB8_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: s_mov_b32 s5, 0
@@ -1330,14 +1330,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v13
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v5
 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB8_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB8_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB8_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2
@@ -1391,10 +1391,10 @@
 ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2
 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[0:1], v2
-; GCN-IR-NEXT: s_mov_b64 s[8:9], 0
+; GCN-IR-NEXT: s_mov_b64 s[10:11], 0
 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0
 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-IR-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
 ; GCN-IR-NEXT: s_cbranch_execz BB9_5
 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0
@@ -1426,14 +1426,14 @@
 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0
 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v12
 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v13, s[4:5]
-; GCN-IR-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4
-; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11]
 ; GCN-IR-NEXT: s_cbranch_execnz BB9_3
 ; GCN-IR-NEXT: ; %bb.4: ; %Flow
-; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
-; GCN-IR-NEXT: BB9_5: ; %Flow3
 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11]
+; GCN-IR-NEXT: BB9_5: ; %Flow3
+; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9]
 ; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
 ; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3
 ; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2