Index: lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SILowerControlFlow.cpp +++ lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -80,7 +80,7 @@ bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To); void Skip(MachineInstr &From, MachineOperand &To); - void SkipIfDead(MachineInstr &MI); + bool skipIfDead(MachineInstr &MI); void If(MachineInstr &MI); void Else(MachineInstr &MI, bool ExecModified); @@ -93,12 +93,16 @@ void Kill(MachineInstr &MI); void Branch(MachineInstr &MI); - void splitBlockLiveIns(const MachineBasicBlock &MBB, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg); + std::pair<MachineBasicBlock *, MachineBasicBlock *> + splitBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + + void splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg); void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL, MachineInstr *MovRel, @@ -115,7 +119,7 @@ static char ID; SILowerControlFlow() : - MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { } + MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -175,7 +179,19 @@ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ) return true; - if (++NumInstr >= SkipThreshold) + if (I->isInlineAsm()) { + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + const char *AsmStr = I->getOperand(0).getSymbolName(); + + // inlineasm length estimate is number of bytes assuming the longest + // instruction. 
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI); + NumInstr += MaxAsmSize / MAI->getMaxInstLength(); + } else { + ++NumInstr; + } + + if (NumInstr >= SkipThreshold) return true; } } @@ -193,36 +209,55 @@ .addOperand(To); } -void SILowerControlFlow::SkipIfDead(MachineInstr &MI) { - +bool SILowerControlFlow::skipIfDead(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); - DebugLoc DL = MI.getDebugLoc(); if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(&MBB, &MBB.getParent()->back())) - return; + return false; - MachineBasicBlock::iterator Insert = &MI; - ++Insert; + LivePhysRegs RemainderLiveRegs(TRI); + RemainderLiveRegs.addLiveOuts(MBB); + + MachineBasicBlock *SkipBB; + MachineBasicBlock *RemainderBB; + std::tie(SkipBB, RemainderBB) = splitBlock(MBB, MI.getIterator()); + + const DebugLoc &DL = MI.getDebugLoc(); // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3); + BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addMBB(RemainderBB); + + MBB.addSuccessor(RemainderBB); + + MachineBasicBlock::iterator Insert = SkipBB->begin(); // Exec mask is zero: Export to NULL target... - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0) - .addReg(AMDGPU::VGPR0); - - // ... and terminate wavefront - BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef) + .addReg(AMDGPU::VGPR0, RegState::Undef); + + // ... and terminate wavefront. 
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)); + + for (const MachineInstr &Inst : reverse(*RemainderBB)) + RemainderLiveRegs.stepBackward(Inst); + + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + for (unsigned Reg : RemainderLiveRegs) { + if (MRI.isAllocatable(Reg)) + RemainderBB->addLiveIn(Reg); + } + + return true; } void SILowerControlFlow::If(MachineInstr &MI) { @@ -388,20 +423,13 @@ } // All currently live registers must remain so in the remainder block. -void SILowerControlFlow::splitBlockLiveIns(const MachineBasicBlock &MBB, - const MachineInstr &MI, - MachineBasicBlock &LoopBB, - MachineBasicBlock &RemainderBB, - unsigned SaveReg, - const MachineOperand &IdxReg) { - LivePhysRegs RemainderLiveRegs(TRI); - - RemainderLiveRegs.addLiveOuts(MBB); - for (MachineBasicBlock::const_reverse_iterator I = MBB.rbegin(), E(&MI); - I != E; ++I) { - RemainderLiveRegs.stepBackward(*I); - } - +void SILowerControlFlow::splitLoadM0BlockLiveIns(LivePhysRegs &RemainderLiveRegs, + const MachineRegisterInfo &MRI, + const MachineInstr &MI, + MachineBasicBlock &LoopBB, + MachineBasicBlock &RemainderBB, + unsigned SaveReg, + const MachineOperand &IdxReg) { // Add reg defined in loop body. RemainderLiveRegs.addReg(SaveReg); @@ -412,13 +440,11 @@ } } - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); for (unsigned Reg : RemainderLiveRegs) { if (MRI.isAllocatable(Reg)) RemainderBB.addLiveIn(Reg); } - const MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src); if (!Src->isUndef()) LoopBB.addLiveIn(Src->getReg()); @@ -471,6 +497,30 @@ .addMBB(&LoopBB); } +std::pair<MachineBasicBlock *, MachineBasicBlock *> +SILowerControlFlow::splitBlock(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + MachineFunction *MF = MBB.getParent(); + + // To insert the loop we need to split the block. Move everything after this + // point to a new block, and insert a new empty block between the two. 
+ MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); + MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(MBB); + ++MBBI; + + MF->insert(MBBI, LoopBB); + MF->insert(MBBI, RemainderBB); + + // Move the rest of the block into a new block. + RemainderBB->transferSuccessors(&MBB); + RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + + MBB.addSuccessor(LoopBB); + + return std::make_pair(LoopBB, RemainderBB); +} + // Returns true if a new block was inserted. bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) { MachineBasicBlock &MBB = *MI.getParent(); @@ -494,7 +544,6 @@ return false; } - MachineFunction &MF = *MBB.getParent(); MachineOperand *SaveOp = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); SaveOp->setIsDead(false); unsigned Save = SaveOp->getReg(); @@ -507,25 +556,24 @@ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), Save) .addReg(AMDGPU::EXEC); - // To insert the loop we need to split the block. Move everything after this - // point to a new block, and insert a new empty block between the two. - MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock(); - MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock(); - MachineFunction::iterator MBBI(MBB); - ++MBBI; + LivePhysRegs RemainderLiveRegs(TRI); - MF.insert(MBBI, LoopBB); - MF.insert(MBBI, RemainderBB); + RemainderLiveRegs.addLiveOuts(MBB); - LoopBB->addSuccessor(LoopBB); - LoopBB->addSuccessor(RemainderBB); + MachineBasicBlock *LoopBB; + MachineBasicBlock *RemainderBB; - splitBlockLiveIns(MBB, MI, *LoopBB, *RemainderBB, Save, *Idx); + std::tie(LoopBB, RemainderBB) = splitBlock(MBB, I); - // Move the rest of the block into a new block. 
- RemainderBB->transferSuccessors(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); - MBB.addSuccessor(LoopBB); + for (const MachineInstr &Inst : reverse(*RemainderBB)) + RemainderLiveRegs.stepBackward(Inst); + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LoopBB->addSuccessor(RemainderBB); + LoopBB->addSuccessor(LoopBB); + + splitLoadM0BlockLiveIns(RemainderLiveRegs, MRI, MI, *LoopBB, + *RemainderBB, Save, *Idx); emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, *Idx, Offset); @@ -684,16 +732,25 @@ case AMDGPU::SI_END_CF: if (--Depth == 0 && HaveKill) { - SkipIfDead(MI); HaveKill = false; + + if (skipIfDead(MI)) { + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } } EndCf(MI); break; case AMDGPU::SI_KILL: - if (Depth == 0) - SkipIfDead(MI); - else + if (Depth == 0) { + if (skipIfDead(MI)) { + NextBB = std::next(BI); + BE = MF.end(); + Next = MBB.end(); + } + } else HaveKill = true; Kill(MI); break; Index: test/CodeGen/AMDGPU/indirect-addressing-undef.mir =================================================================== --- test/CodeGen/AMDGPU/indirect-addressing-undef.mir +++ test/CodeGen/AMDGPU/indirect-addressing-undef.mir @@ -3,7 +3,7 @@ # CHECK-LABEL: name: extract_undef_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -103,7 +103,7 @@ # CHECK-LABEL: name: extract_undef_neg_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit 
%exec @@ -159,7 +159,7 @@ # CHECK-LABEL: name: insert_undef_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -215,7 +215,7 @@ # CHECK-LABEL: name: insert_undef_neg_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 undef %vgpr10, implicit %exec @@ -272,7 +272,7 @@ # CHECK-LABEL: insert_undef_value_offset_vgpr{{$}} # CHECK: bb.1: -# CHECK: successors: %bb.1(0x40000000 / 0x80000000 = 50.00%), %bb.2(0x40000000 / 0x80000000 = 50.00%) +# CHECK: successors: %bb.2(0x40000000 / 0x80000000 = 50.00%), %bb.1(0x40000000 / 0x80000000 = 50.00%) # CHECK: liveins: %vgpr4, %vgpr0_vgpr1_vgpr2_vgpr3{{$}} # CHECK: %vcc_lo = V_READFIRSTLANE_B32 %vgpr4, implicit %exec Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -0,0 +1,145 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 { + call void @llvm.AMDGPU.kill(float 0.0) + ret void +} + +; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: s_mov_b64 exec, 0 +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 { + call void @llvm.AMDGPU.kill(float -0.0) + ret void +} + +; CHECK-LABEL: 
{{^}}test_kill_depth_var: +; CHECK-NEXT: ; BB#0: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_depth_var(float %x) #0 { + call void @llvm.AMDGPU.kill(float %x) + ret void +} + +; FIXME: why does the skip depend on the asm length in the same block? + +; CHECK-LABEL: {{^}}test_kill_control_flow: +; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]] + +; CHECK: ; BB#1: +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 + +; CHECK: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] +; CHECK-NEXT: ; BB#3: +; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 +; CHECK-NEXT: s_endpgm + +; CHECK-NEXT: {{^}}[[SPLIT_BB]]: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK-NEXT: {{^}}BB{{[0-9]+_[0-9]+}}: +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 { +entry: + %cmp = icmp eq i32 %arg, 0 + br i1 %cmp, label %bb, label %exit + +bb: + %var = call float asm sideeffect " + v_mov_b32_e64 v7, -1 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", "={VGPR7}"() + call void @llvm.AMDGPU.kill(float %var) + br label %exit + +exit: + ret void +} + +; CHECK-LABEL: {{^}}test_kill_control_flow_remainder: +; CHECK: s_cmp_lg_i32 s{{[0-9]+}}, 0 +; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]] + +; CHECK-NEXT: ; BB#1: ; %bb +; CHECK: v_mov_b32_e64 v7, -1 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: v_nop_e64 +; CHECK: ;;#ASMEND +; CHECK: v_mov_b32_e64 v8, -1 +; CHECK: ;;#ASMEND +; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]] + +; CHECK-NEXT: ; BB#3: +; CHECK-NEXT: exp 0, 9, 0, 1, 1, v0, v0, v0, v0 +; CHECK-NEXT: s_endpgm + 
+; CHECK-NEXT: {{^}}[[SPLIT_BB]]: +; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v7 +; CHECK: buffer_store_dword v8 +; CHECK: v_mov_b32_e64 v9, -2 + +; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}: +; CHECK: buffer_store_dword v9 +; CHECK-NEXT: s_endpgm +define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 { +entry: + %cmp = icmp eq i32 %arg, 0 + br i1 %cmp, label %bb, label %exit + +bb: + %var = call float asm sideeffect " + v_mov_b32_e64 v7, -1 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64 + v_nop_e64", "={VGPR7}"() + %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"() + call void @llvm.AMDGPU.kill(float %var) + store volatile float %live.across, float addrspace(1)* undef + %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"() + br label %exit + +exit: + %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ] + store float %phi, float addrspace(1)* undef + ret void +} + +declare void @llvm.AMDGPU.kill(float) #0 + +attributes #0 = { nounwind }