Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -123,6 +123,9 @@
   unsigned getRegisterByName(const char* RegName, EVT VT,
                              SelectionDAG &DAG) const override;
 
+  MachineBasicBlock *splitKillBlock(MachineInstr &MI,
+                                    MachineBasicBlock *BB) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *BB) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1070,6 +1070,64 @@
                                  + StringRef(RegName) + "\"."));
 }
 
+// If kill is not the last instruction, split the block so kill is always a
+// proper terminator.
+MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
+                                                    MachineBasicBlock *BB) const {
+
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator SplitPoint(&MI);
+  ++SplitPoint;
+
+  if (SplitPoint == BB->end()) {
+    // Don't bother with a new block.
+    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    return BB;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  MachineBasicBlock *SplitBB
+    = MF->CreateMachineBasicBlock(BB->getBasicBlock());
+
+  SmallSet<unsigned, 4> SplitDefRegs;
+  for (auto I = SplitPoint, E = BB->end(); I != E; ++I) {
+    for (MachineOperand &Def : I->defs())
+      SplitDefRegs.insert(Def.getReg());
+  }
+
+  // Fix the block phi references to point to the new block for the defs in the
+  // second piece of the block.
+  for (MachineBasicBlock *Succ : BB->successors()) {
+    for (MachineInstr &MI : *Succ) {
+      if (!MI.isPHI())
+        break;
+
+      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+        unsigned IncomingReg = MI.getOperand(I).getReg();
+        MachineOperand &FromBB = MI.getOperand(I + 1);
+        if (BB == FromBB.getMBB()) {
+          if (SplitDefRegs.count(IncomingReg))
+            FromBB.setMBB(SplitBB);
+
+          break;
+        }
+      }
+    }
+  }
+
+  MF->insert(++MachineFunction::iterator(BB), SplitBB);
+  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
+
+
+  SplitBB->transferSuccessors(BB);
+  BB->addSuccessor(SplitBB);
+
+  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  return SplitBB;
+}
+
 MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
@@ -1082,8 +1140,6 @@
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::BRANCH:
-    return BB;
   case AMDGPU::GET_GROUPSTATICSIZE: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
@@ -1096,6 +1152,8 @@
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::SI_KILL:
+    return splitKillBlock(MI, BB);
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1983,8 +1983,16 @@
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
 def SI_KILL : InstSI <
   (outs), (ins VSrc_32:$src), "",
-  [(int_AMDGPU_kill f32:$src)]
->;
+  [(int_AMDGPU_kill f32:$src)]> {
+  let isConvergent = 1;
+  let usesCustomInserter = 1;
+}
+
+def SI_KILL_TERMINATOR : InstSI <
+  (outs), (ins VSrc_32:$src), "", []> {
+  let isTerminator = 1;
+}
+
 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -80,7 +80,7 @@
 
   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
   void Skip(MachineInstr &From, MachineOperand &To);
-  void SkipIfDead(MachineInstr &MI);
+  bool skipIfDead(MachineInstr &MI);
 
   void If(MachineInstr &MI);
   void Else(MachineInstr &MI, bool ExecModified);
@@ -93,12 +93,16 @@
   void Kill(MachineInstr &MI);
   void Branch(MachineInstr &MI);
 
-  void splitBlockLiveIns(const MachineBasicBlock &MBB,
-                         const MachineInstr &MI,
-                         MachineBasicBlock &LoopBB,
-                         MachineBasicBlock &RemainderBB,
-                         unsigned SaveReg,
-                         const MachineOperand &IdxReg);
+  std::pair<MachineBasicBlock *, MachineBasicBlock *>
+  splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+
+  void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+                               const MachineRegisterInfo &MRI,
+                               const MachineInstr &MI,
+                               MachineBasicBlock &LoopBB,
+                               MachineBasicBlock &RemainderBB,
+                               unsigned SaveReg,
+                               const MachineOperand &IdxReg);
 
   void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
                               MachineInstr *MovRel,
@@ -115,7 +119,7 @@
 
   static char ID;
 
   SILowerControlFlow() :
-    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -175,7 +179,19 @@
         I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
       return true;
 
-    if (++NumInstr >= SkipThreshold)
+    if (I->isInlineAsm()) {
+      const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+      const char *AsmStr = I->getOperand(0).getSymbolName();
+
+      // inlineasm length estimate is number of bytes assuming the longest
+      // instruction.
+      uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+      NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+    } else {
+      ++NumInstr;
+    }
+
+    if (NumInstr >= SkipThreshold)
       return true;
   }
 }
@@ -193,36 +209,49 @@
   .addOperand(To);
 }
 
-void SILowerControlFlow::SkipIfDead(MachineInstr &MI) {
-
+bool SILowerControlFlow::skipIfDead(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
-  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
 
-  if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+  if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
-    return;
+    return false;
+
+  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+  MachineFunction::iterator BBInsertPt(&MBB);
+  ++BBInsertPt;
+
+  MF->insert(BBInsertPt, SkipBB);
 
-  MachineBasicBlock::iterator Insert = &MI;
-  ++Insert;
+  assert(MBB.succ_size() == 1);
+
+  const DebugLoc &DL = MI.getDebugLoc();
 
   // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addImm(3);
+  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+    .addMBB(&*BBInsertPt);
+
+  SkipBB->addSuccessor(&*BBInsertPt);
+  MBB.addSuccessor(SkipBB);
+
+  MachineBasicBlock::iterator Insert = SkipBB->begin();
 
   // Exec mask is zero: Export to NULL target...
-  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
-    .addImm(0)
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addImm(0)
-    .addImm(1)
-    .addImm(1)
-    .addReg(AMDGPU::VGPR0)
-    .addReg(AMDGPU::VGPR0)
-    .addReg(AMDGPU::VGPR0)
-    .addReg(AMDGPU::VGPR0);
-
-  // ... and terminate wavefront
-  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+    .addImm(0)
+    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+    .addImm(0)
+    .addImm(1)
+    .addImm(1)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef)
+    .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+  // ... and terminate wavefront.
+  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+  return true;
 }
 
 void SILowerControlFlow::If(MachineInstr &MI) {
@@ -388,20 +417,13 @@
 }
 
 // All currently live registers must remain so in the remainder block.
-void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB,
-                                           const MachineInstr &MI,
-                                           MachineBasicBlock &LoopBB,
-                                           MachineBasicBlock &RemainderBB,
-                                           unsigned SaveReg,
-                                           const MachineOperand &IdxReg) {
-  LivePhysRegs RemainderLiveRegs(TRI);
-
-  RemainderLiveRegs.addLiveOuts(MBB);
-  for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI);
-       I != E; ++I) {
-    RemainderLiveRegs.stepBackward(*I);
-  }
-
+void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs,
+                                                 const MachineRegisterInfo &MRI,
+                                                 const MachineInstr &MI,
+                                                 MachineBasicBlock &LoopBB,
+                                                 MachineBasicBlock &RemainderBB,
+                                                 unsigned SaveReg,
+                                                 const MachineOperand &IdxReg) {
   // Add reg defined in loop body.
   RemainderLiveRegs.addReg(SaveReg);
 
@@ -412,13 +434,11 @@
     }
   }
 
-  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   for (unsigned Reg : RemainderLiveRegs) {
     if (MRI.isAllocatable(Reg))
       RemainderBB.addLiveIn(Reg);
   }
 
-
   const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src);
   if (!Src->isUndef())
     LoopBB.addLiveIn(Src->getReg());
@@ -471,6 +491,30 @@
     .addMBB(&LoopBB);
 }
 
+std::pair<MachineBasicBlock *, MachineBasicBlock *>
+SILowerControlFlow::splitBlock(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I) {
+  MachineFunction *MF = MBB.getParent();
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessors(&MBB);
+  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+  MBB.addSuccessor(LoopBB);
+
+  return std::make_pair(LoopBB, RemainderBB);
+}
+
 // Returns true if a new block was inserted.
 bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
   MachineBasicBlock &MBB = *MI.getParent();
@@ -494,7 +538,6 @@
     return false;
   }
 
-  MachineFunction &MF = *MBB.getParent();
   MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
   SaveOp->setIsDead(false);
   unsigned Save = SaveOp->getReg();
@@ -507,25 +550,24 @@
 
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save)
     .addReg(AMDGPU::EXEC);
 
-  // To insert the loop we need to split the block. Move everything after this
-  // point to a new block, and insert a new empty block between the two.
-  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
-  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
+  LivePhysRegs RemainderLiveRegs(TRI);
 
-  MF.insert(MBBI, LoopBB);
-  MF.insert(MBBI, RemainderBB);
+  RemainderLiveRegs.addLiveOuts(MBB);
 
-  LoopBB->addSuccessor(LoopBB);
-  LoopBB->addSuccessor(RemainderBB);
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
 
-  splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx);
+  std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I);
 
-  // Move the rest of the block into a new block.
-  RemainderBB->transferSuccessors(&MBB);
-  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-  MBB.addSuccessor(LoopBB);
+  for (const MachineInstr &Inst : reverse(*RemainderBB))
+    RemainderLiveRegs.stepBackward(Inst);
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  LoopBB->addSuccessor(RemainderBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB,
+                          *RemainderBB, Save, *Idx);
 
   emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset);
@@ -684,16 +726,25 @@
 
         case AMDGPU::SI_END_CF:
           if (--Depth == 0 && HaveKill) {
-            SkipIfDead(MI);
            HaveKill = false;
+
+            if (skipIfDead(MI)) {
+              NextBB = std::next(BI);
+              BE = MF.end();
+              Next = MBB.end();
+            }
          }
          EndCf(MI);
          break;
 
-        case AMDGPU::SI_KILL:
-          if (Depth == 0)
-            SkipIfDead(MI);
-          else
+        case AMDGPU::SI_KILL_TERMINATOR:
+          if (Depth == 0) {
+            if (skipIfDead(MI)) {
+              NextBB = std::next(BI);
+              BE = MF.end();
+              Next = MBB.end();
+            }
+          } else
            HaveKill = true;
          Kill(MI);
          break;
Index: test/CodeGen/AMDGPU/indirect-addressing-undef.mir
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-undef.mir
+++ test/CodeGen/AMDGPU/indirect-addressing-undef.mir
@@ -3,7 +3,7 @@
 # CHECK-LABEL: name: extract_undef_offset_vgpr{{$}}
 
 # CHECK: bb.1:
-# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
 # CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
 
 # CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@@ -103,7 +103,7 @@
 # CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}}
 
 # CHECK: bb.1:
-# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
 # CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
 
 # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@@ -159,7 +159,7 @@
 # CHECK-LABEL: name: insert_undef_offset_vgpr{{$}}
 
 # CHECK: bb.1:
-# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
 # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
 
 # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@@ -215,7 +215,7 @@
 # CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}}
 
 # CHECK: bb.1:
-# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
 # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
 
 # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec
@@ -272,7 +272,7 @@
 # CHECK-LABEL: name: insert_undef_value_offset_vgpr{{$}}
 
 # CHECK: bb.1:
-# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%)
+# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%)
 # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}}
 
 # CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -0,0 +1,148 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
+  call void @llvm.AMDGPU.kill(float 0.0)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
+  call void @llvm.AMDGPU.kill(float -0.0)
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_depth_var:
+; CHECK-NEXT: ; BB#0:
+; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
+  call void @llvm.AMDGPU.kill(float %x)
+  ret void
+}
+
+; FIXME: why does the skip depend on the asm length in the same block?
+
+; CHECK-LABEL: {{^}}test_kill_control_flow:
+; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#1:
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: ; BB#3:
+; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: s_endpgm
+
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+entry:
+  %cmp = icmp eq i32 %arg, 0
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %var = call float asm sideeffect "
+    v_mov_b32_e64 v7, -1
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "={VGPR7}"()
+  call void @llvm.AMDGPU.kill(float %var)
+  br label %exit
+
+exit:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
+; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#1: ; %bb
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: ;;#ASMEND
+; CHECK: v_mov_b32_e64 v8, -1
+; CHECK: ;;#ASMEND
+; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
+; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK-NEXT: ; BB#4:
+; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0
+; CHECK-NEXT: s_endpgm
+
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
+; CHECK: buffer_store_dword v8
+; CHECK: v_mov_b32_e64 v9, -2
+
+; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
+; CHECK: buffer_store_dword v9
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
+entry:
+  %cmp = icmp eq i32 %arg, 0
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %var = call float asm sideeffect "
+    v_mov_b32_e64 v7, -1
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "={VGPR7}"()
+  %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"()
+  call void @llvm.AMDGPU.kill(float %var)
+  store volatile float %live.across, float addrspace(1)* undef
+  %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"()
+  br label %exit
+
+exit:
+  %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
+  store float %phi, float addrspace(1)* undef
+  ret void
+}
+
+declare void @llvm.AMDGPU.kill(float) #0
+
+attributes #0 = { nounwind }