Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineOutliner.h" @@ -1638,6 +1639,28 @@ return false; } + /// During PHI eleimination lets target to make necessary checks and + /// insert the copy to the PHI destination register in a target specific + /// manner. + virtual MachineInstr *createPHIDestinationCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, Register Dst) const { + return BuildMI(MBB, InsPt, DL, get(TargetOpcode::COPY), Dst) + .addReg(Src); + } + + /// During PHI eleimination lets target to make necessary checks and + /// insert the copy to the PHI destination register in a target specific + /// manner. + virtual MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register SrcSubReg, + Register Dst) const { + return BuildMI(MBB, InsPt, DL, get(TargetOpcode::COPY), Dst) + .addReg(Src, 0, SrcSubReg); + } + /// Returns a \p outliner::OutlinedFunction struct containing target-specific /// information for a set of outlining candidates. virtual outliner::OutlinedFunction getOutliningCandidateInfo( Index: llvm/lib/CodeGen/PHIElimination.cpp =================================================================== --- llvm/lib/CodeGen/PHIElimination.cpp +++ llvm/lib/CodeGen/PHIElimination.cpp @@ -31,7 +31,9 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/Pass.h" @@ -252,11 +254,12 @@ // Insert a register to register copy at the top of the current block (but // after any remaining phi nodes) which copies the new incoming register // into the phi node destination. + MachineInstr *PHICopy = nullptr; const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); if (allPhiOperandsUndefined(*MPhi, *MRI)) // If all sources of a PHI node are implicit_def or undef uses, just emit an // implicit_def instead of a copy. - BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), + PHICopy = BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF), DestReg); else { // Can we reuse an earlier PHI node? This only happens for critical edges, @@ -273,15 +276,13 @@ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg); entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC); } - BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), DestReg) - .addReg(IncomingReg); + // Give the target possiblity to handle special cases fallthrough otherwise + PHICopy = TII->createPHIDestinationCopy(MBB, AfterPHIsIt, MPhi->getDebugLoc(), + IncomingReg, DestReg); } // Update live variable information if there is any. if (LV) { - MachineInstr &PHICopy = *std::prev(AfterPHIsIt); - if (IncomingReg) { LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg); @@ -302,7 +303,7 @@ // killed. Note that because the value is defined in several places (once // each for each incoming block), the "def" block and instruction fields // for the VarInfo is not filled in. - LV->addVirtualRegisterKilled(IncomingReg, PHICopy); + LV->addVirtualRegisterKilled(IncomingReg, *PHICopy); } // Since we are going to be deleting the PHI node, if it is the last use of @@ -312,15 +313,14 @@ // If the result is dead, update LV. if (isDead) { - LV->addVirtualRegisterDead(DestReg, PHICopy); + LV->addVirtualRegisterDead(DestReg, *PHICopy); LV->removeVirtualRegisterDead(DestReg, *MPhi); } } // Update LiveIntervals for the new copy or implicit def. if (LIS) { - SlotIndex DestCopyIndex = - LIS->InsertMachineInstrInMaps(*std::prev(AfterPHIsIt)); + SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(*PHICopy); SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB); if (IncomingReg) { @@ -406,9 +406,9 @@ if (DefMI->isImplicitDef()) ImpDefs.insert(DefMI); } else { - NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(), - TII->get(TargetOpcode::COPY), IncomingReg) - .addReg(SrcReg, 0, SrcSubReg); + NewSrcInstr = + TII->createPHISourceCopy(opBlock, InsertPos, MPhi->getDebugLoc(), + SrcReg, SrcSubReg, IncomingReg); } } @@ -457,7 +457,7 @@ } } else { // We just inserted this copy. - KillInst = std::prev(InsertPos); + KillInst = NewSrcInstr; } } assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction"); Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -954,6 +954,19 @@ bool isBasicBlockPrologue(const MachineInstr &MI) const override; + MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register Dst) const override; + + MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, + Register SrcSubReg, + Register Dst) const override; + + bool isWave32() const; + /// Return a partially built integer add instruction without carry. /// Caller must add source operands. /// For pre-GFX9 it will generate unused carry destination operand. Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6393,3 +6393,40 @@ return true; } } + +MachineInstr *SIInstrInfo::createPHIDestinationCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt, + const DebugLoc &DL, Register Src, Register Dst) const { + auto Cur = MBB.begin(); + if (Cur != MBB.end()) + do { + if (!Cur->isPHI() && Cur->readsRegister(Dst)) + return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src); + ++Cur; + } while (Cur != MBB.end() && Cur != LastPHIIt); + + return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src, + Dst); +} + +MachineInstr *SIInstrInfo::createPHISourceCopy( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, + const DebugLoc &DL, Register Src, Register SrcSubReg, Register Dst) const { + if (InsPt != MBB.end() && + (InsPt->getOpcode() == AMDGPU::SI_IF || + InsPt->getOpcode() == AMDGPU::SI_ELSE || + InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) && + InsPt->definesRegister(Src)) { + InsPt++; + return BuildMI(MBB, InsPt, InsPt->getDebugLoc(), + get(ST.isWave32() ? AMDGPU::S_MOV_B32_term + : AMDGPU::S_MOV_B64_term), + Dst) + .addReg(Src, 0, SrcSubReg) + .addReg(AMDGPU::EXEC, RegState::Implicit); + } + return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg, + Dst); +} + +bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); } Index: llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -98,6 +98,8 @@ void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); + Register getSaveExec(MachineInstr* MI); + void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl &Src) const; @@ -175,17 +177,31 @@ return true; } +Register SILowerControlFlow::getSaveExec(MachineInstr *MI) { + MachineBasicBlock *MBB = MI->getParent(); + MachineOperand &SaveExec = MI->getOperand(0); + assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister); + + Register SaveExecReg = SaveExec.getReg(); + unsigned FalseTermOpc = + TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; + MachineBasicBlock::iterator I = (MI); + MachineBasicBlock::iterator J = std::next(I); + if (J != MBB->end() && J->getOpcode() == FalseTermOpc && + J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) { + SaveExecReg = J->getOperand(0).getReg(); + J->eraseFromParent(); + } + return SaveExecReg; +} + void SILowerControlFlow::emitIf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - - MachineOperand &SaveExec = MI.getOperand(0); - MachineOperand &Cond = MI.getOperand(1); - assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister && - Cond.getSubReg() == AMDGPU::NoSubRegister); - - Register SaveExecReg = SaveExec.getReg(); + Register SaveExecReg = getSaveExec(&MI); + MachineOperand& Cond = MI.getOperand(1); + assert(Cond.getSubReg() == AMDGPU::NoSubRegister); MachineOperand &ImpDefSCC = MI.getOperand(4); assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); @@ -266,8 +282,7 @@ MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister); + Register DstReg = getSaveExec(&MI); bool ExecModified = MI.getOperand(3).getImm() != 0; MachineBasicBlock::iterator Start = MBB.begin(); @@ -339,7 +354,7 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - auto Dst = MI.getOperand(0).getReg(); + auto Dst = getSaveExec(&MI); // Skip ANDing with exec if the break condition is already masked by exec // because it is a V_CMP in the same basic block. (We know the break @@ -400,13 +415,17 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + unsigned CFMask = MI.getOperand(0).getReg(); + MachineInstr *Def = MRI.getUniqueVRegDef(CFMask); const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsPt = MBB.begin(); - MachineInstr *NewMI = - BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) - .addReg(Exec) - .add(MI.getOperand(0)); + MachineBasicBlock::iterator InsPt = + Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def)) + : MBB.begin(); + MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) + .addReg(Exec) + .add(MI.getOperand(0)); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *NewMI); Index: llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir +++ llvm/test/CodeGen/AMDGPU/phi-elimination-assertion.mir @@ -26,8 +26,8 @@ # CHECK-LABEL: name: foo # CHECK: bb.3: -# CHECK-NEXT: %3:sreg_32_xm0 = COPY killed %4 # CHECK-NEXT: dead %2:sreg_32_xm0 = IMPLICIT_DEF +# CHECK-NEXT: %3:sreg_32_xm0 = COPY killed %4 # CHECK-NEXT: S_NOP 0, implicit killed %3 Index: llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -0,0 +1,54 @@ +# RUN: llc -mtriple amdgcn -run-pass livevars -run-pass phi-node-elimination -verify-machineinstrs -o - %s | FileCheck %s + +# CHECK-LABEL: phi-cf-test +# CHECK: bb.0: +# CHECK: [[COND:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 +# CHECK: [[IF_SOURCE0:%[0-9]+]]:sreg_64 = SI_IF [[COND]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec +# CHECK: [[IF_INPUT_REG:%[0-9]+]]:sreg_64 = S_MOV_B64_term killed [[IF_SOURCE0]], implicit $exec + +# CHECK: bb.1: +# CHECK: [[END_CF_ARG:%[0-9]+]]:sreg_64 = COPY killed [[IF_INPUT_REG]] +# CHECK: SI_END_CF killed [[END_CF_ARG]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + +# CHECK: bb.2: +# CHECK: [[IF_SOURCE1:%[0-9]+]]:sreg_64 = SI_IF [[COND]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec +# CHECK: [[IF_INPUT_REG]]:sreg_64 = S_MOV_B64_term killed [[IF_SOURCE1]], implicit $exec + + +... +--- +name: phi-cf-test +tracksRegLiveness: true +body: | + + bb.0: + successors: %bb.3(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0 + + %5:vgpr_32(s32) = COPY $vgpr0 + %0:sreg_64 = V_CMP_EQ_U32_e64 0, %5(s32), implicit $exec + %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %22:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3(0x80000000) + + %24:sreg_64 = PHI %20, %bb.3, %22, %bb.0 + %23:vgpr_32 = PHI %19, %bb.3, %18, %bb.0 + SI_END_CF %24, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + %3:vgpr_32, dead %10:sreg_64 = nsw V_ADD_I32_e64 1, %23, 0, implicit $exec + + bb.3: + successors: %bb.3(0x40000000), %bb.2(0x40000000) + + %4:vgpr_32 = PHI %19, %bb.3, %3, %bb.2, %18, %bb.0 + %15:sreg_32_xm0 = S_MOV_B32 61440 + %16:sreg_32_xm0 = S_MOV_B32 -1 + %17:sreg_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3 + BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %19:vgpr_32 = COPY %4 + %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.3 + +...