Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -599,7 +599,7 @@
       if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
         unsigned SrcReg = MI.getOperand(1).getReg();
         if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
-          TII->moveToVALU(MI);
+          TII->moveToVALU(MI, *MDT);
           break;
         }
 
@@ -614,7 +614,7 @@
           MI.setDesc(TII->get(SMovOp));
           break;
         }
-        TII->moveToVALU(MI);
+        TII->moveToVALU(MI, *MDT);
       } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
         tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
       }
@@ -677,7 +677,7 @@
       SmallSet<unsigned, 8> Visited;
       if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
         LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
-        TII->moveToVALU(MI);
+        TII->moveToVALU(MI, *MDT);
       }
       break;
     }
@@ -690,7 +690,7 @@
 
       LLVM_DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI);
 
-      TII->moveToVALU(MI);
+      TII->moveToVALU(MI, *MDT);
       break;
     case AMDGPU::INSERT_SUBREG: {
       const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
@@ -700,7 +700,7 @@
       if (TRI->isSGPRClass(DstRC) &&
           (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
         LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
-        TII->moveToVALU(MI);
+        TII->moveToVALU(MI, *MDT);
       }
       break;
     }
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -37,6 +37,7 @@
 namespace llvm {
 
 class APInt;
+class MachineDominatorTree;
 class MachineRegisterInfo;
 class RegScavenger;
 class GCNSubtarget;
@@ -79,8 +80,8 @@
 private:
   void swapOperands(MachineInstr &Inst) const;
 
-  bool moveScalarAddSub(SetVectorType &Worklist,
-                        MachineInstr &Inst) const;
+  bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                        MachineDominatorTree &MDT) const;
 
   void lowerScalarAbs(SetVectorType &Worklist,
                       MachineInstr &Inst) const;
@@ -91,11 +92,12 @@
   void splitScalar64BitUnaryOp(SetVectorType &Worklist,
                                MachineInstr &Inst, unsigned Opcode) const;
 
-  void splitScalar64BitAddSub(SetVectorType &Worklist,
-                              MachineInstr &Inst) const;
+  void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                              MachineDominatorTree &MDT) const;
 
-  void splitScalar64BitBinaryOp(SetVectorType &Worklist,
-                                MachineInstr &Inst, unsigned Opcode) const;
+  void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+                                unsigned Opcode,
+                                MachineDominatorTree &MDT) const;
 
   void splitScalar64BitBCNT(SetVectorType &Worklist,
                             MachineInstr &Inst) const;
@@ -787,12 +789,12 @@
   /// Legalize all operands in this instruction. This function may
   /// create new instruction and insert them before \p MI.
-  void legalizeOperands(MachineInstr &MI) const;
+  void legalizeOperands(MachineInstr &MI, MachineDominatorTree &MDT) const;
 
   /// Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
   /// VALU if necessary.
-  void moveToVALU(MachineInstr &MI) const;
+  void moveToVALU(MachineInstr &MI, MachineDominatorTree &MDT) const;
 
   void insertWaitStates(MachineBasicBlock &MBB,MachineBasicBlock::iterator MI,
                         int Count) const;
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -17,10 +17,10 @@
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "GCNHazardRecognizer.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -31,6 +31,7 @@
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -3657,9 +3658,8 @@
 
 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
 // with SGPRs by iterating over all unique values across all lanes.
-static void loadSRsrcFromVGPR(const SIInstrInfo &TII,
-                              MachineInstr &MI,
-                              MachineOperand &Rsrc) {
+static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineDominatorTree &MDT,
+                              MachineInstr &MI, MachineOperand &Rsrc) {
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3699,6 +3699,16 @@
 
   MBB.addSuccessor(LoopBB);
 
+  // Update dominators. We know that MBB immediately dominates LoopBB, that
+  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
+  // dominates all of the successors transferred to it from MBB that MBB used
+  // to dominate.
+  MDT.addNewBlock(LoopBB, &MBB);
+  MDT.addNewBlock(RemainderBB, LoopBB);
+  for (auto &Succ : RemainderBB->successors())
+    if (MDT.dominates(&MBB, Succ))
+      MDT.changeImmediateDominator(Succ, RemainderBB);
+
   emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
 
   // Restore the EXEC mask
@@ -3750,7 +3760,8 @@
   return std::tie(RsrcPtr, NewSRsrc);
 }
 
-void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
+void SIInstrInfo::legalizeOperands(MachineInstr &MI,
+                                   MachineDominatorTree &MDT) const {
   MachineFunction &MF = *MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
@@ -4023,12 +4034,13 @@
     } else {
       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
       // to SGPRs.
-      loadSRsrcFromVGPR(*this, MI, *Rsrc);
+      loadSRsrcFromVGPR(*this, MDT, MI, *Rsrc);
     }
   }
 }
 
-void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+                             MachineDominatorTree &MDT) const {
   SetVectorType Worklist;
   Worklist.insert(&TopInst);
 
@@ -4046,29 +4058,29 @@
       break;
 
     case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO:
-      splitScalar64BitAddSub(Worklist, Inst);
+      splitScalar64BitAddSub(Worklist, Inst, MDT);
       Inst.eraseFromParent();
       continue;
 
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32:
      // FIXME: The u32 versions currently selected use the carry.
-      if (moveScalarAddSub(Worklist, Inst))
+      if (moveScalarAddSub(Worklist, Inst, MDT))
         continue;
 
       // Default handling
       break;
 
     case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64, MDT);
       Inst.eraseFromParent();
       continue;
 
     case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64, MDT);
       Inst.eraseFromParent();
       continue;
 
@@ -4155,7 +4167,7 @@
       continue;
 
     case AMDGPU::S_XNOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32);
+      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
       Inst.eraseFromParent();
       continue;
 
@@ -4255,7 +4267,7 @@
 
       // Legalize all operands other than the offset. Notably, convert the srsrc
      // into SGPRs using v_readfirstlane if needed.
-      legalizeOperands(*NewInstr);
+      legalizeOperands(*NewInstr, MDT);
       continue;
     }
   }
@@ -4263,7 +4275,7 @@
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
       // legalize its operands instead.
-      legalizeOperands(Inst);
+      legalizeOperands(Inst, MDT);
       continue;
     }
 
@@ -4352,7 +4364,7 @@
     }
 
     // Legalize the operands
-    legalizeOperands(Inst);
+    legalizeOperands(Inst, MDT);
 
     if (HasDst)
       addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
@@ -4360,8 +4372,8 @@
 }
 
 // Add/sub require special handling to deal with carry outs.
-bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist,
-                                   MachineInstr &Inst) const {
+bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+                                   MachineDominatorTree &MDT) const {
   if (ST.hasAddNoCarry()) {
     // Assume there is no user of scc since we don't select this in that case.
    // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
@@ -4385,7 +4397,7 @@
   Inst.setDesc(get(NewOpc));
   Inst.addImplicitDefUseOperands(*MBB.getParent());
   MRI.replaceRegWith(OldDstReg, ResultReg);
-  legalizeOperands(Inst);
+  legalizeOperands(Inst, MDT);
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 
   return true;
@@ -4505,8 +4517,9 @@
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitAddSub(
-    SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+                                         MachineInstr &Inst,
+                                         MachineDominatorTree &MDT) const {
   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
 
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -4566,16 +4579,16 @@
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  legalizeOperands(*LoHalf);
-  legalizeOperands(*HiHalf);
+  legalizeOperands(*LoHalf, MDT);
+  legalizeOperands(*HiHalf, MDT);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }
 
-void SIInstrInfo::splitScalar64BitBinaryOp(
-    SetVectorType &Worklist, MachineInstr &Inst,
-    unsigned Opcode) const {
+void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+                                           MachineInstr &Inst, unsigned Opcode,
+                                           MachineDominatorTree &MDT) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
@@ -4633,8 +4646,8 @@
 
   // Try to legalize the operands in case we need to swap the order to keep it
   // valid.
-  legalizeOperands(LoHalf);
-  legalizeOperands(HiHalf);
+  legalizeOperands(LoHalf, MDT);
+  legalizeOperands(HiHalf, MDT);
 
   // Move all users of this moved vlaue.
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
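
Note for reviewers: the dominator-tree bookkeeping added to loadSRsrcFromVGPR is the
standard incremental-update pattern for splitting a block. The sketch below restates
it as a standalone helper so the invariants are easier to check. The helper name
updateDomTreeForWaterfallLoop is hypothetical (the patch inlines this logic at the
split point), but the MachineDominatorTree calls are the real LLVM API.

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineDominators.h"

  using namespace llvm;

  // Hypothetical helper mirroring the update in loadSRsrcFromVGPR. It assumes
  // the CFG has already been rewired so that MBB branches to LoopBB, LoopBB's
  // sole exit is RemainderBB, and RemainderBB took over MBB's old successors.
  static void updateDomTreeForWaterfallLoop(MachineDominatorTree &MDT,
                                            MachineBasicBlock &MBB,
                                            MachineBasicBlock &LoopBB,
                                            MachineBasicBlock &RemainderBB) {
    // The two new blocks form a straight-line dominance chain:
    // MBB idom LoopBB, and LoopBB idom RemainderBB (the loop's only exit).
    MDT.addNewBlock(&LoopBB, &MBB);
    MDT.addNewBlock(&RemainderBB, &LoopBB);

    // A successor that MBB used to dominate is now reachable only through
    // RemainderBB, so RemainderBB becomes its immediate dominator. Successors
    // MBB merely branched to without dominating keep their old idom.
    for (MachineBasicBlock *Succ : RemainderBB.successors())
      if (MDT.dominates(&MBB, Succ))
        MDT.changeImmediateDominator(Succ, &RemainderBB);
  }

One subtlety worth noting: the loop only mutates the dominator tree, never
RemainderBB's successor list, so iterating successors() while calling
changeImmediateDominator is safe.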