Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -123,6 +123,9 @@
   unsigned getRegisterByName(const char* RegName, EVT VT,
                              SelectionDAG &DAG) const override;
 
+  MachineBasicBlock *splitKillBlock(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1070,6 +1070,62 @@
                            + StringRef(RegName)  + "\"."));
 }
 
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+                                                    MachineBasicBlock *BB) const {
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator SplitPoint(&MI);
+  ++SplitPoint;
+
+  if (SplitPoint == BB->end()) {
+    // Don't bother with a new block.
+    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineBasicBlock *SplitBB
+    = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+  SmallSet<unsigned, 8> SplitDefRegs;
+  for (auto I = SplitPoint, E = BB->end(); I != E; ++I) {
+    for (MachineOperand &Def : I->defs())
+      SplitDefRegs.insert(Def.getReg());
+  }
+
+  // Fix the block phi references to point to the new block for the defs in the
+  // second piece of the block.
+  for (MachineBasicBlock *Succ : BB->successors()) {
+    for (MachineInstr &MI : *Succ) {
+      if (!MI.isPHI())
+        break;
+
+      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+        unsigned IncomingReg = MI.getOperand(I).getReg();
+        MachineOperand &FromBB = MI.getOperand(I + 1);
+        if (BB == FromBB.getMBB()) {
+          if (SplitDefRegs.count(IncomingReg))
+            FromBB.setMBB(SplitBB);
+
+          break;
+        }
+      }
+    }
+  }
+
+  MF->insert(++MachineFunction::iterator(BB), SplitBB);
+  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+
+
+  SplitBB->transferSuccessors(BB);
+  BB->addSuccessor(SplitBB);
+
+  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  return SplitBB;
+}
+
 MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
@@ -1082,8 +1138,6 @@
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::BRANCH:
-    return BB;
   case AMDGPU::GET_GROUPSTATICSIZE: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
 
@@ -1096,6 +1150,8 @@
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::SI_KILL:
+    return splitKillBlock(MI, BB);
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1983,8 +1983,16 @@
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
 def SI_KILL : InstSI <
   (outs), (ins VSrc_32:$src), "",
-  [(int_AMDGPU_kill f32:$src)]
->;
+  [(int_AMDGPU_kill f32:$src)]> {
+  let isConvergent = 1;
+  let usesCustomInserter = 1;
+}
+
+def SI_KILL_TERMINATOR : InstSI <
+  (outs), (ins VSrc_32:$src), "", []> {
+  let isTerminator = 1;
+}
+
 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -211,25 +211,29 @@
 
 bool SILowerControlFlow::skipIfDead(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction *MF = MBB.getParent();
 
-  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
       !shouldSkip(&MBB, &MBB.getParent()->back()))
     return false;
 
-  LivePhysRegs RemainderLiveRegs(TRI);
-  RemainderLiveRegs.addLiveOuts(MBB);
+  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
 
-  MachineBasicBlock *SkipBB;
-  MachineBasicBlock *RemainderBB;
-  std::tie(SkipBB, RemainderBB) = splitBlock(MBB, MI.getIterator());
+  MachineFunction::iterator BBInsertPt(&MBB);
+  ++BBInsertPt;
+
+  MF->insert(BBInsertPt, SkipBB);
+
+  assert(MBB.succ_size() == 1);
 
   const DebugLoc &DL = MI.getDebugLoc();
 
   // If the exec mask is non-zero, skip the next two instructions
   BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(RemainderBB);
+    .addMBB(&*BBInsertPt);
 
-  MBB.addSuccessor(RemainderBB);
+  SkipBB->addSuccessor(&*BBInsertPt);
+  MBB.addSuccessor(SkipBB);
 
   MachineBasicBlock::iterator Insert = SkipBB->begin();
 
@@ -247,16 +251,6 @@
 
   // ... and terminate wavefront.
   BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
-
-  for (const MachineInstr &Inst : reverse(*RemainderBB))
-    RemainderLiveRegs.stepBackward(Inst);
-
-  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-  for (unsigned Reg : RemainderLiveRegs) {
-    if (MRI.isAllocatable(Reg))
-      RemainderBB->addLiveIn(Reg);
-  }
-
   return true;
 }
 
@@ -743,7 +737,7 @@
         EndCf(MI);
         break;
 
-      case AMDGPU::SI_KILL:
+      case AMDGPU::SI_KILL_TERMINATOR:
         if (Depth == 0) {
           if (skipIfDead(MI)) {
             NextBB = std::next(BI);
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -223,7 +223,7 @@
       // Control flow-type instructions that are followed by WQM computations
      // must themselves be in WQM.
       if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
-          (MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL)) {
+          (MI.isBranch() || MI.isTerminator())) {
         Instructions[&MI].Needs = StateWQM;
         II.Needs = StateWQM;
       }
@@ -444,9 +444,6 @@
 
         State = Needs;
       }
-
-      if (MI.getOpcode() == AMDGPU::SI_KILL)
-        WQMFromExec = false;
     }
 
     if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -2,6 +2,7 @@
 
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
 ; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: ; BB#1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
   call void @llvm.AMDGPU.kill(float 0.0)
@@ -11,6 +12,7 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; BB#0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
   call void @llvm.AMDGPU.kill(float -0.0)
@@ -20,6 +22,7 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; BB#0:
 ; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
   call void @llvm.AMDGPU.kill(float %x)
@@ -32,7 +35,8 @@
 ; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
 ; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
 
-; CHECK: ; BB#1:
+; CHECK-NEXT: ; BB#1:
+; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_nop_e64
 ; CHECK: v_nop_e64
 ; CHECK: v_nop_e64
@@ -44,14 +48,13 @@
 ; CHECK: v_nop_e64
 ; CHECK: v_nop_e64
 
-; CHECK: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: ; BB#3:
 ; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7
-; CHECK-NEXT: {{^}}BB{{[0-9]+_[0-9]+}}:
 ; CHECK-NEXT: s_endpgm
 
 define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
 entry:
@@ -95,14 +98,14 @@
 ; CHECK: ;;#ASMEND
 ; CHECK: v_mov_b32_e64 v8, -1
 ; CHECK: ;;#ASMEND
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
 ; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; BB#3:
+; CHECK-NEXT: ; BB#4:
 ; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
-; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7
 ; CHECK: buffer_store_dword v8
 ; CHECK: v_mov_b32_e64 v9, -2