diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1052,8 +1052,8 @@
   addPass(&SIInsertHardClausesID);
 
   addPass(&SIRemoveShortExecBranchesID);
-  addPass(&SIPreEmitPeepholeID);
   addPass(&SIInsertSkipsPassID);
+  addPass(&SIPreEmitPeepholeID);
   addPass(&BranchRelaxationPassID);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -57,10 +57,13 @@
   unsigned SkipThreshold = 0;
   MachineDominatorTree *MDT = nullptr;
 
+  MachineBasicBlock *EarlyExitBlock = nullptr;
+
   bool shouldSkip(const MachineBasicBlock &From,
                   const MachineBasicBlock &To) const;
 
   bool dominatesAllReachable(MachineBasicBlock &MBB);
+  void createEarlyExitBlock(MachineBasicBlock &MBB);
   void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                   DebugLoc DL);
 
@@ -161,6 +164,33 @@
   return true;
 }
 
+static void generatePsEndPgm(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I, DebugLoc DL,
+                             const SIInstrInfo *TII) {
+  // Generate "null export; s_endpgm".
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addReg(AMDGPU::VGPR0, RegState::Undef)
+      .addImm(1)  // vm
+      .addImm(0)  // compr
+      .addImm(0); // en
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+}
+
+void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
+  MachineFunction *MF = MBB.getParent();
+  DebugLoc DL;
+
+  assert(!EarlyExitBlock);
+  EarlyExitBlock = MF->CreateMachineBasicBlock();
+  MF->insert(MF->end(), EarlyExitBlock);
+
+  generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
+}
+
 /// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
 /// iterator. Only applies to pixel shaders.
 void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
@@ -168,11 +198,6 @@
   MachineFunction *MF = MBB.getParent();
   assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
 
-  // Currently, SI_KILL_*_TERMINATOR is expected to occur only as the last
-  // terminator of a basic block. If this ever changes, we need to optionally
-  // split MBB here.
-  assert(I == MBB.end());
-
   // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
   // basic block that has no further successors (e.g., there was an
   // `unreachable` there in IR). This can happen with original source of the
@@ -186,34 +211,40 @@
   // In this case, we write the "null_export; s_endpgm" skip code in the
   // already-existing basic block.
   auto NextBBI = std::next(MBB.getIterator());
-  bool NoSuccessor = llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
-  MachineBasicBlock *SkipBB;
+  bool NoSuccessor = I == MBB.end() &&
+                     llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
 
   if (NoSuccessor) {
-    SkipBB = &MBB;
+    generatePsEndPgm(MBB, I, DL, TII);
   } else {
-    // Create a new basic block that will contain the "null export; s_endpgm"
-    // and set up the branching to go around it.
-    SkipBB = MF->CreateMachineBasicBlock();
-    MF->insert(NextBBI, SkipBB);
+    if (!EarlyExitBlock) {
+      createEarlyExitBlock(MBB);
+      // Update next block pointer to reflect any new blocks
+      NextBBI = std::next(MBB.getIterator());
+    }
 
-    BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&*NextBBI);
-    MBB.addSuccessor(SkipBB);
+    auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+                        .addMBB(EarlyExitBlock);
+
+    // Split the block if the branch will not come at the end.
+    auto Next = std::next(BranchMI->getIterator());
+    if (Next != MBB.end() && !Next->isTerminator()) {
+      MachineBasicBlock *SplitBB =
+          MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+      MF->insert(NextBBI, SplitBB);
+      SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
+      SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
+      // FIXME: the expectation is that this will be used near the beginning
+      // of a block so just assume all registers are still live.
+      for (auto LiveIn : MBB.liveins())
+        SplitBB->addLiveIn(LiveIn);
+      MBB.addSuccessor(SplitBB);
+      MDT->addNewBlock(SplitBB, &MBB);
+    }
 
-    MDT->addNewBlock(SkipBB, &MBB);
+    MBB.addSuccessor(EarlyExitBlock);
+    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
   }
-
-  // Generate "null export; s_endpgm".
-  BuildMI(SkipBB, DL, TII->get(AMDGPU::EXP_DONE))
-      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addReg(AMDGPU::VGPR0, RegState::Undef)
-      .addImm(1)  // vm
-      .addImm(0)  // compr
-      .addImm(0); // en
-  BuildMI(SkipBB, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
 }
 
 /// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
@@ -428,6 +459,7 @@
     Kill->eraseFromParent();
   }
   KillInstrs.clear();
+  EarlyExitBlock = nullptr;
 
   return MadeChange;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -14,13 +14,13 @@
 
 # CHECK: bb.1:
 # CHECK: V_CMPX_LE_F32_e32
-# CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+# CHECK-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec
 
-# CHECK: bb.3:
-# CHECK-NEXT: EXP_DONE
+# CHECK: bb.2:
 # CHECK: S_ENDPGM 0
 
-# CHECK: bb.2:
+# CHECK: bb.3:
+# CHECK-NEXT: EXP_DONE
 # CHECK: S_ENDPGM 0
 
name: kill_uncond_branch
diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -12,11 +12,11 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_cbranch_execnz BB1_2
+; CHECK-NEXT: s_cbranch_execz BB1_2
 ; CHECK-NEXT: ; %bb.1:
-; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: BB1_2:
+; CHECK-NEXT: exp null off, off, off, off done vm
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -27,15 +27,14 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_cbranch_execnz BB2_2
-; CHECK: exp null
-; CHECK-NEXT: s_endpgm
-; CHECK: BB2_2:
+; CHECK-NEXT: s_cbranch_execz BB2_3
+; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_mov_b64 exec, 0
-; CHECK-NEXT: s_cbranch_execnz BB2_4
-; CHECK: exp null
+; CHECK-NEXT: s_cbranch_execz BB2_3
+; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB2_4:
+; CHECK-NEXT: BB2_3:
+; CHECK: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
   call void @llvm.amdgcn.kill(i1 false)
@@ -46,10 +45,11 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execnz BB3_2
-; CHECK: exp null
+; CHECK-NEXT: s_cbranch_execz BB3_2
+; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: s_endpgm
 ; CHECK-NEXT: BB3_2:
+; CHECK: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -61,15 +61,14 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execnz BB4_2
-; CHECK: exp null
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB4_2:
+; CHECK-NEXT: s_cbranch_execz BB4_3
+; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execnz BB4_4
-; CHECK: exp null
+; CHECK-NEXT: s_cbranch_execz BB4_3
+; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB4_4:
+; CHECK-NEXT: BB4_3:
+; CHECK: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
   %cmp = fcmp olt float %x, 0.0
@@ -82,15 +81,14 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execnz BB5_2
-; CHECK: exp null
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB5_2:
+; CHECK-NEXT: s_cbranch_execz BB5_3
+; CHECK-NEXT: ; %bb.1
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v1
-; CHECK-NEXT: s_cbranch_execnz BB5_4
-; CHECK: exp null
+; CHECK-NEXT: s_cbranch_execz BB5_3
+; CHECK-NEXT: ; %bb.2
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB5_4:
+; CHECK-NEXT: BB5_3:
+; CHECK: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
   %cmp.x = fcmp olt float %x, 0.0
@@ -103,18 +101,15 @@
 ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
 ; CHECK-NEXT: ; %bb.0:
 ; CHECK-NEXT: v_cmpx_gt_f32_e32 vcc, 0, v0
-; CHECK-NEXT: s_cbranch_execnz BB6_2
+; CHECK-NEXT: s_cbranch_execz BB6_3
 ; CHECK-NEXT: ; %bb.1:
-; CHECK-NEXT: exp
-; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB6_2:
 ; CHECK: v_mov_b32_e64 v7, -1
 ; CHECK: v_cmpx_gt_f32_e32 vcc, 0, v7
-; CHECK-NEXT: s_cbranch_execnz BB6_4
-; CHECK-NEXT: ; %bb.3:
-; CHECK-NEXT: exp
+; CHECK-NEXT: s_cbranch_execz BB6_3
+; CHECK-NEXT: ; %bb.2:
 ; CHECK-NEXT: s_endpgm
-; CHECK-NEXT: BB6_4:
+; CHECK-NEXT: BB6_3:
+; CHECK-NEXT: exp null
 ; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
   %cmp.x = fcmp olt float %x, 0.0
@@ -237,6 +232,62 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_kill_control_flow_return:
+
+; CHECK: v_cmp_eq_u32_e64 [[KILL_CC:s\[[0-9]+:[0-9]+\]]], s0, 1
+; CHECK: s_and_b64 exec, exec, s[2:3]
+; CHECK-NEXT: s_cbranch_execz [[EXIT_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; CHECK: s_cbranch_scc0 [[COND_BB:BB[0-9]+_[0-9]+]]
+; CHECK: s_branch [[RETURN_BB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[COND_BB]]:
+; CHECK: v_mov_b32_e64 v7, -1
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_nop_e64
+; CHECK: v_mov_b32_e32 v0, v7
+
+; CHECK: [[EXIT_BB]]:
+; CHECK-NEXT: exp null
+; CHECK-NEXT: s_endpgm
+
+; CHECK: [[RETURN_BB]]:
+define amdgpu_ps float @test_kill_control_flow_return(i32 inreg %arg) #0 {
+entry:
+  %kill = icmp eq i32 %arg, 1
+  %cmp = icmp eq i32 %arg, 0
+  call void @llvm.amdgcn.kill(i1 %kill)
+  br i1 %cmp, label %bb, label %exit
+
+bb:
+  %var = call float asm sideeffect "
+    v_mov_b32_e64 v7, -1
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "={v7}"()
+  br label %exit
+
+exit:
+  %ret = phi float [ %var, %bb ], [ 0.0, %entry ]
+  ret float %ret
+}
+
 ; CHECK-LABEL: {{^}}test_kill_divergent_loop:
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
@@ -295,12 +346,9 @@
 ; CHECK-LABEL: {{^}}phi_use_def_before_kill:
 ; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
 ; CHECK: v_cmpx_lt_f32_e32 vcc, 0,
-; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
+; CHECK-NEXT: s_cbranch_execz [[EXITBB:BB[0-9]+_[0-9]+]]
 
-; CHECK: exp
-; CHECK-NEXT: s_endpgm
-
-; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
+; CHECK: ; %[[KILLBB:bb.[0-9]+]]:
 ; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]
 
 ; CHECK: [[PHIBB]]:
@@ -313,6 +361,10 @@
 
 ; CHECK: [[ENDBB]]:
 ; CHECK-NEXT: s_endpgm
+
+; CHECK: [[EXITBB]]:
+; CHECK: exp null
+; CHECK-NEXT: s_endpgm
 define amdgpu_ps void @phi_use_def_before_kill(float inreg %x) #0 {
 bb:
   %tmp = fadd float %x, 1.000000e+00
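
Note on the shape of the change, for readers of the hunks above: instead of materializing a fresh "exp null; s_endpgm" block for every SI_KILL_*_TERMINATOR, the pass now keeps one early-exit block per MachineFunction. It is created lazily by createEarlyExitBlock the first time skipIfDead needs it, every subsequent kill site branches to it with S_CBRANCH_EXECZ, and the cached pointer is cleared at the end of the pass run, which is why the updated tests all funnel into a single shared exit label. A minimal sketch of that get-or-create shape, using only APIs visible in the diff; the wrapper name getOrCreateEarlyExitBlock is hypothetical, the patch itself splits this logic between createEarlyExitBlock and its caller:

// Sketch, not part of the patch: lazily create one shared early-exit
// block per function. EarlyExitBlock is the member added above; it is
// reset to nullptr at the end of runOnMachineFunction, so each
// function gets at most one exit block.
MachineBasicBlock *
SIInsertSkips::getOrCreateEarlyExitBlock(MachineFunction &MF) {
  if (!EarlyExitBlock) {
    // Append the block at the very end of the function so every kill
    // site can reach it with a forward S_CBRANCH_EXECZ.
    EarlyExitBlock = MF.CreateMachineBasicBlock();
    MF.insert(MF.end(), EarlyExitBlock);
    generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DebugLoc(), TII);
  }
  return EarlyExitBlock;
}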