diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -95,8 +95,7 @@ } bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { - df_iterator_default_set Reachable; - bool ModifiedPHI = false; + df_iterator_default_set Reachable; MachineDominatorTree *MDT = getAnalysisIfAvailable(); MachineLoopInfo *MLI = getAnalysisIfAvailable(); @@ -118,9 +117,9 @@ if (MDT && MDT->getNode(&BB)) MDT->eraseNode(&BB); while (BB.succ_begin() != BB.succ_end()) { - MachineBasicBlock* succ = *BB.succ_begin(); + MachineBasicBlock *Succ = *BB.succ_begin(); - for (MachineInstr &Phi : succ->phis()) { + for (MachineInstr &Phi : make_early_inc_range(Succ->phis())) { for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) { if (Phi.getOperand(i).isMBB() && Phi.getOperand(i).getMBB() == &BB) { @@ -128,6 +127,34 @@ Phi.removeOperand(i - 1); } } + + if (Phi.getNumOperands() == 3) { + const MachineOperand &Input = Phi.getOperand(1); + const MachineOperand &Output = Phi.getOperand(0); + Register InputReg = Input.getReg(); + Register OutputReg = Output.getReg(); + assert(Output.getSubReg() == 0 && "Cannot have output subregister"); + + if (InputReg != OutputReg) { + MachineRegisterInfo &MRI = F.getRegInfo(); + unsigned InputSub = Input.getSubReg(); + if (InputSub == 0 && + MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg)) && + !Input.isUndef()) { + MRI.replaceRegWith(OutputReg, InputReg); + } else { + // The input register to the PHI has a subregister or it can't + // be constrained to the proper register class or it is undef: + // insert a COPY instead of simply replacing the output + // with the input. + const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); + BuildMI(*Succ, Succ->getFirstNonPHI(), Phi.getDebugLoc(), + TII->get(TargetOpcode::COPY), OutputReg) + .addReg(InputReg, getRegState(Input), InputSub); + } + Phi.eraseFromParent(); + } + } } BB.removeSuccessor(BB.succ_begin()); @@ -145,52 +172,7 @@ BB->eraseFromParent(); } - // Cleanup PHI nodes. - for (MachineBasicBlock &BB : F) { - // Prune unneeded PHI entries. - SmallPtrSet preds(BB.pred_begin(), - BB.pred_end()); - for (MachineInstr &Phi : make_early_inc_range(BB.phis())) { - for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) { - if (!preds.count(Phi.getOperand(i).getMBB())) { - Phi.removeOperand(i); - Phi.removeOperand(i - 1); - ModifiedPHI = true; - } - } - - if (Phi.getNumOperands() == 3) { - const MachineOperand &Input = Phi.getOperand(1); - const MachineOperand &Output = Phi.getOperand(0); - Register InputReg = Input.getReg(); - Register OutputReg = Output.getReg(); - assert(Output.getSubReg() == 0 && "Cannot have output subregister"); - ModifiedPHI = true; - - if (InputReg != OutputReg) { - MachineRegisterInfo &MRI = F.getRegInfo(); - unsigned InputSub = Input.getSubReg(); - if (InputSub == 0 && - MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg)) && - !Input.isUndef()) { - MRI.replaceRegWith(OutputReg, InputReg); - } else { - // The input register to the PHI has a subregister or it can't be - // constrained to the proper register class or it is undef: - // insert a COPY instead of simply replacing the output - // with the input. - const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); - BuildMI(BB, BB.getFirstNonPHI(), Phi.getDebugLoc(), - TII->get(TargetOpcode::COPY), OutputReg) - .addReg(InputReg, getRegState(Input), InputSub); - } - Phi.eraseFromParent(); - } - } - } - } - F.RenumberBlocks(); - return (!DeadBlocks.empty() || ModifiedPHI); + return !DeadBlocks.empty(); } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -174,7 +174,9 @@ ; SI-NEXT: S_BRANCH %bb.6 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.for.end: - ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI6]], 0, killed [[PHI5]], 0, 0, implicit $mode, implicit $exec + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI [[PHI5]], %bb.5 + ; SI-NEXT: [[PHI8:%[0-9]+]]:vgpr_32 = PHI [[PHI6]], %bb.5 + ; SI-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[PHI8]], 0, killed [[PHI7]], 0, 0, implicit $mode, implicit $exec ; SI-NEXT: $vgpr0 = COPY killed [[V_ADD_F32_e64_]] ; SI-NEXT: SI_RETURN_TO_EPILOG killed $vgpr0 entry: @@ -233,10 +235,10 @@ ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef [[COPY47:%[0-9]+]]:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef [[COPY49:%[0-9]+]]:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef [[COPY51:%[0-9]+]]:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef [[COPY53:%[0-9]+]]:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -249,8 +251,8 @@ ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef [[COPY57:%[0-9]+]]:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef [[COPY59:%[0-9]+]]:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -286,8 +288,8 @@ ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef [[COPY59:%[0-9]+]]:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef [[COPY61:%[0-9]+]]:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -356,9 +358,9 @@ ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef [[COPY50:%[0-9]+]]:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef [[COPY52:%[0-9]+]]:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef [[COPY54:%[0-9]+]]:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -371,7 +373,7 @@ ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef [[COPY56:%[0-9]+]]:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -407,7 +409,7 @@ ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef [[COPY58:%[0-9]+]]:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -509,7 +511,7 @@ ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef [[COPY38:%[0-9]+]]:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %36:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1807,64 +1807,64 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_mov_b32 s0, exec_lo ; GFX1032-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v7, v3 +; GFX1032-NEXT: v_mov_b32_e32 v6, v2 +; GFX1032-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-NEXT: v_mov_b32_e32 v4, v0 ; GFX1032-NEXT: v_mov_b32_e32 v8, 0 ; GFX1032-NEXT: s_branch .LBB33_2 ; GFX1032-NEXT: .LBB33_1: ; %body ; GFX1032-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1032-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1032-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1032-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1032-NEXT: s_cbranch_execz .LBB33_4 ; GFX1032-NEXT: .LBB33_2: ; %loop ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v7, v3 -; GFX1032-NEXT: v_mov_b32_e32 v6, v2 -; GFX1032-NEXT: v_mov_b32_e32 v5, v1 -; GFX1032-NEXT: v_mov_b32_e32 v4, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v4 +; GFX1032-NEXT: v_mov_b32_e32 v1, v5 +; GFX1032-NEXT: v_mov_b32_e32 v2, v6 +; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1032-NEXT: ; %bb.3: -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1032-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1032-NEXT: ; implicit-def: $vgpr8 ; GFX1032-NEXT: .LBB33_4: ; %break ; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_mov_b32_e32 v0, v4 -; GFX1032-NEXT: v_mov_b32_e32 v1, v5 -; GFX1032-NEXT: v_mov_b32_e32 v2, v6 -; GFX1032-NEXT: v_mov_b32_e32 v3, v7 ; GFX1032-NEXT: ; return to shader part epilog ; ; GFX1064-LABEL: test_loop_vcc: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_wqm_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v7, v3 +; GFX1064-NEXT: v_mov_b32_e32 v6, v2 +; GFX1064-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-NEXT: v_mov_b32_e32 v4, v0 ; GFX1064-NEXT: v_mov_b32_e32 v8, 0 ; GFX1064-NEXT: s_branch .LBB33_2 ; GFX1064-NEXT: .LBB33_1: ; %body ; GFX1064-NEXT: ; in Loop: Header=BB33_2 Depth=1 -; GFX1064-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX1064-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX1064-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX1064-NEXT: s_cbranch_execz .LBB33_4 ; GFX1064-NEXT: .LBB33_2: ; %loop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: v_cmp_lt_f32_e32 vcc, 0x40e00000, v8 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v7, v3 -; GFX1064-NEXT: v_mov_b32_e32 v6, v2 -; GFX1064-NEXT: v_mov_b32_e32 v5, v1 -; GFX1064-NEXT: v_mov_b32_e32 v4, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v4 +; GFX1064-NEXT: v_mov_b32_e32 v1, v5 +; GFX1064-NEXT: v_mov_b32_e32 v2, v6 +; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: s_cbranch_vccz .LBB33_1 ; GFX1064-NEXT: ; %bb.3: -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX1064-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX1064-NEXT: ; implicit-def: $vgpr8 ; GFX1064-NEXT: .LBB33_4: ; %break ; GFX1064-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_mov_b32_e32 v0, v4 -; GFX1064-NEXT: v_mov_b32_e32 v1, v5 -; GFX1064-NEXT: v_mov_b32_e32 v2, v6 -; GFX1064-NEXT: v_mov_b32_e32 v3, v7 ; GFX1064-NEXT: ; return to shader part epilog entry: br label %loop