diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -18,9 +18,11 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -28,6 +30,7 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -52,16 +55,16 @@
   const SIRegisterInfo *TRI = nullptr;
   const SIInstrInfo *TII = nullptr;
   unsigned SkipThreshold = 0;
+  MachineDominatorTree *MDT = nullptr;

   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;

-  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+  bool dominatesAllReachable(MachineBasicBlock &MBB);
+  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  DebugLoc DL);

-  void kill(MachineInstr &MI);
-
-  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+  bool kill(MachineInstr &MI);

   bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

@@ -79,6 +82,7 @@
   }

   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -87,8 +91,11 @@

 char SIInsertSkips::ID = 0;

-INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
-                "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+                      "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+                    "SI insert s_cbranch_execz instructions", false, false)

 char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

@@ -146,42 +153,71 @@
   return false;
 }

-bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction *MF = MBB.getParent();
-
-  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
-      !shouldSkip(MBB, MBB.getParent()->back()))
-    return false;
-
-  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
-
-  const DebugLoc &DL = MI.getDebugLoc();
-
-  // If the exec mask is non-zero, skip the next two instructions
-  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
-    .addMBB(&NextBB);
-
-  MachineBasicBlock::iterator Insert = SkipBB->begin();
+/// Check whether \p MBB dominates all blocks that are reachable from it.
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock *Other : depth_first(&MBB)) {
+    if (!MDT->dominates(&MBB, Other))
+      return false;
+  }
+  return true;
+}

-  // Exec mask is zero: Export to NULL target...
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
-    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addReg(AMDGPU::VGPR0, RegState::Undef)
-    .addImm(1) // vm
-    .addImm(0) // compr
-    .addImm(0); // en
+/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
+/// iterator. Only applies to pixel shaders.
+void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, DebugLoc DL) {
+  MachineFunction *MF = MBB.getParent();
+  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+  // Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
+  // terminator of a basic block. If this ever changes, we need to optionally
+  // split MBB here.
+  assert(I == MBB.end());
+
+  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
+  // basic block that has no further successors (e.g., there was an
+  // `unreachable` there in IR). This can happen with original source of the
+  // form:
+  //
+  //   if (uniform_condition) {
+  //     write_to_memory();
+  //     discard;
+  //   }
+  //
+  // In this case, we write the "null_export; s_endpgm" skip code in the
+  // already-existing basic block.
+  auto NextBBI = std::next(MBB.getIterator());
+  bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+  MachineBasicBlock *SkipBB;
+
+  if (NoSuccessor) {
+    SkipBB = &MBB;
+  } else {
+    // Create a new basic block that will contain the "null export; s_endpgm"
+    // and set up the branching to go around it.
+    SkipBB = MF->CreateMachineBasicBlock();
+    MF->insert(NextBBI, SkipBB);

-  // ... and terminate wavefront.
-  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+    BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
+    MBB.addSuccessor(SkipBB);
+  }

-  return true;
+  // Generate "null export; s_endpgm".
+  BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
+      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addImm(1) // vm
+      .addImm(0) // compr
+      .addImm(0); // en
+  BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
 }

-void SIInsertSkips::kill(MachineInstr &MI) {
+/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
+/// Return true unless the terminator is a no-op.
+bool SIInsertSkips::kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();

@@ -268,7 +304,7 @@
         I.addImm(0); // omod
     }

-    break;
+    return true;
   }
   case AMDGPU::SI_KILL_I1_TERMINATOR: {
     const MachineFunction *MF = MI.getParent()->getParent();
@@ -283,11 +319,13 @@
       int64_t Imm = Op.getImm();
       assert(Imm == 0 || Imm == -1);

-      if (Imm == KillVal)
+      if (Imm == KillVal) {
        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                     : AMDGPU::S_MOV_B64), Exec)
          .addImm(0);
-      break;
+        return true;
+      }
+      return false;
     }

     unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
@@ -296,27 +334,14 @@
     BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
         .addReg(Exec)
         .add(Op);
-    break;
+    return true;
   }
   default:
     llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
+    return false;
   }
 }

-MachineBasicBlock *SIInsertSkips::insertSkipBlock(
-  MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
-  MachineFunction *MF = MBB.getParent();
-
-  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, SkipBB);
-  MBB.addSuccessor(SkipBB);
-
-  return SkipBB;
-}
-
 // Returns true if a branch over the block was inserted.
 bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                    MachineBasicBlock &SrcMBB) {
@@ -430,47 +455,21 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
+  MDT = &getAnalysis<MachineDominatorTree>();
   SkipThreshold = SkipThresholdFlag;

-  bool HaveKill = false;
-  bool MadeChange = false;
-
-  // Track depth of exec mask, divergent branches.
-  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
-
-  MachineFunction::iterator NextBB;
-
   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+  SmallVector<MachineInstr *, 4> KillInstrs;
+  bool MadeChange = false;

-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-       BI != BE; BI = NextBB) {
-    NextBB = std::next(BI);
-    MachineBasicBlock &MBB = *BI;
-    bool HaveSkipBlock = false;
-
-    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
-      // Reached convergence point for last divergent branch.
-      ExecBranchStack.pop_back();
-    }
-
-    if (HaveKill && ExecBranchStack.empty()) {
-      HaveKill = false;
-
-      // TODO: Insert skip if exec is 0?
-    }
-
+  for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::iterator I, Next;
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
-
       MachineInstr &MI = *I;

       switch (MI.getOpcode()) {
-      case AMDGPU::S_CBRANCH_EXECZ:
-        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
-        break;
       case AMDGPU::SI_MASK_BRANCH:
-        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
         MadeChange |= skipMaskBranch(MI, MBB);
         break;

@@ -478,32 +477,37 @@
         // Optimize out branches to the next block.
         // FIXME: Shouldn't this be handled by BranchFolding?
         if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+          assert(&MI == &MBB.back());
           MI.eraseFromParent();
-        } else if (HaveSkipBlock) {
-          // Remove the given unconditional branch when a skip block has been
-          // inserted after the current one and let skip the two instructions
-          // performing the kill if the exec mask is non-zero.
-          MI.eraseFromParent();
+          MadeChange = true;
         }
         break;

       case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
-      case AMDGPU::SI_KILL_I1_TERMINATOR:
+      case AMDGPU::SI_KILL_I1_TERMINATOR: {
         MadeChange = true;
-        kill(MI);
-
-        if (ExecBranchStack.empty()) {
-          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
-            HaveSkipBlock = true;
-            NextBB = std::next(BI);
-            BE = MF.end();
-          }
+        bool CanKill = kill(MI);
+
+        // Check if we can add an early "if exec=0 { end shader }".
+        //
+        // Note that we _always_ do this if it is correct, even if the kill
+        // happens fairly late in the shader, because the null export should
+        // generally still be cheaper than normal export(s).
+        //
+        // TODO: The dominatesAllReachable check is conservative: if the
+        // dominance is only missing due to _uniform_ branches, we could
+        // in fact insert the early-exit as well.
+        if (CanKill &&
+            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+            dominatesAllReachable(MBB)) {
+          // Mark the instruction for kill-if-dead insertion. We delay this
+          // change because it modifies the CFG.
+          KillInstrs.push_back(&MI);
         } else {
-          HaveKill = true;
+          MI.eraseFromParent();
         }
-
-        MI.eraseFromParent();
         break;
+      }

       case AMDGPU::SI_RETURN_TO_EPILOG:
         // FIXME: Should move somewhere else
@@ -511,7 +515,7 @@
         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
         // because external bytecode will be appended at the end.
-        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+        if (&MBB != &MF.back() || &MI != &MBB.back()) {
           // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
           // the end and jump there.
           if (!EmptyMBBAtEnd) {
@@ -520,9 +524,9 @@
           }

           MBB.addSuccessor(EmptyMBBAtEnd);
-          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
             .addMBB(EmptyMBBAtEnd);
-          I->eraseFromParent();
+          MI.eraseFromParent();
         }

         break;
@@ -537,5 +541,12 @@
     }
   }

+  for (MachineInstr *Kill : KillInstrs) {
+    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
+               Kill->getDebugLoc());
+    Kill->eraseFromParent();
+  }
+  KillInstrs.clear();
+
   return MadeChange;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -12,7 +12,11 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
+; CHECK-NEXT: s_cbranch_execnz BB1_2
 ; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: exp null off, off, off, off done vm
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -23,9 +27,15 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB2_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK: BB2_2:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB2_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB2_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -36,7 +46,10 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB3_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB3_2:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -48,9 +61,15 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB4_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_2:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB4_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB4_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -59,12 +78,19 @@
   ret void
 }

+; FIXME: Ideally only one early-exit would be emitted
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: ; %bb.1:
+; CHECK-NEXT: s_cbranch_execnz BB5_2
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_2:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_cbranch_execnz BB5_4
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: BB5_4:
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
   %cmp.x = fcmp olt float %x, 0.0
@@ -119,14 +145,12 @@
 ; CHECK: v_nop_e64
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
-; CHECK-NEXT: s_endpgm
-define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null
+
+; CHECK: v_mov_b32_e32 v0, 1.0
+define amdgpu_ps float @test_kill_control_flow(i32 inreg %arg) #0 {
 entry:
   %cmp = icmp eq i32 %arg, 0
   br i1 %cmp, label %bb, label %exit
@@ -149,7 +173,7 @@
   br label %exit

 exit:
-  ret void
+  ret float 1.0
 }

 ; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
@@ -171,13 +195,10 @@
 ; CHECK: v_mov_b32_e64 v8, -1
 ; CHECK: ;;#ASMEND
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
-; CHECK-NEXT: ; %bb.2:
-; CHECK-NEXT: exp null off, off, off, off done vm
-; CHECK-NEXT: s_endpgm
+; TODO: We could do an early-exit here (the branch above is uniform!)
+; CHECK-NOT: exp null

-; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
 ; CHECK: buffer_store_dword v8
 ; CHECK: v_mov_b32_e64 v9, -2
@@ -433,6 +454,38 @@
   ret void
 }

+; CHECK-LABEL: {{^}}complex_loop:
+; CHECK: s_mov_b64 exec, 0
+; CHECK-NOT: exp null
+define amdgpu_ps void @complex_loop(i32 inreg %cmpa, i32 %cmpb, i32 %cmpc) {
+.entry:
+  %flaga = icmp sgt i32 %cmpa, 0
+  br i1 %flaga, label %.lr.ph, label %._crit_edge
+
+.lr.ph:
+  br label %hdr
+
+hdr:
+  %ctr = phi i32 [ 0, %.lr.ph ], [ %ctr.next, %latch ]
+  %flagb = icmp ugt i32 %ctr, %cmpb
+  br i1 %flagb, label %kill, label %latch
+
+kill:
+  call void @llvm.amdgcn.kill(i1 false)
+  br label %latch
+
+latch:
+  %ctr.next = add nuw nsw i32 %ctr, 1
+  %flagc = icmp slt i32 %ctr.next, %cmpc
+  br i1 %flagc, label %hdr, label %._crit_edge
+
+._crit_edge:
+  %tmp = phi i32 [ -1, %.entry ], [ %ctr.next, %latch ]
+  %out = bitcast i32 %tmp to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> %out, <2 x half> undef, i1 immarg true, i1 immarg true)
+  ret void
+}
+
 declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
 declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
 declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3