Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -73,6 +73,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -271,50 +272,7 @@
   unsigned SubReg = CopyUse.getOperand(1).getSubReg();
   if (SubReg != AMDGPU::NoSubRegister)
     return false;
-
-  MRI.setRegClass(DstReg, DstRC);
-
-  // SGPRx = ...
-  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
-  // VGPRz = COPY SGPRy
-
-  // =>
-  // VGPRx = COPY SGPRx
-  // VGPRz = REG_SEQUENCE VGPRx, sub0
-
-  MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg());
-  bool IsAGPR = TRI->hasAGPRs(DstRC);
-
-  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
-    Register SrcReg = MI.getOperand(I).getReg();
-    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
-
-    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
-    assert(TRI->isSGPRClass(SrcRC) &&
-           "Expected SGPR REG_SEQUENCE to only have SGPR inputs");
-
-    SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
-    const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
-
-    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
-
-    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
-            TmpReg)
-        .add(MI.getOperand(I));
-
-    if (IsAGPR) {
-      const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
-      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
-      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
-        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
-      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
-              TmpAReg)
-          .addReg(TmpReg, RegState::Kill);
-      TmpReg = TmpAReg;
-    }
-
-    MI.getOperand(I).setReg(TmpReg);
-  }
+  TII->convertRegSequenceToVGPR(MI, CopyUse.getOperand(0).getReg(), DstRC);
 
   CopyUse.eraseFromParent();
   return true;
@@ -594,11 +552,9 @@
   MDT = &getAnalysis<MachineDominatorTree>();
 
   SmallVector<MachineInstr *, 16> Worklist;
 
-  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
-                                                  BI != BE; ++BI) {
-    MachineBasicBlock &MBB = *BI;
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+  for (auto MBB : post_order(&MF)) {
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
          I != E; ++I) {
       MachineInstr &MI = *I;
 
@@ -623,7 +579,7 @@
           Register TmpReg
             = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
-          BuildMI(MBB, MI, MI.getDebugLoc(),
+          BuildMI(*MBB, MI, MI.getDebugLoc(),
                   TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
               .add(MI.getOperand(1));
           MI.getOperand(1).setReg(TmpReg);
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -878,6 +878,14 @@
                               const TargetRegisterClass *DstRC,
                               MachineOperand &Op, MachineRegisterInfo &MRI,
                               const DebugLoc &DL) const;
+  // SGPRx = ...
+  // SGPRy = REG_SEQUENCE SGPRx, sub0 ...
+  // VGPRz = COPY SGPRy
+  // =>
+  // VGPRx = COPY SGPRx
+  // VGPRz = REG_SEQUENCE VGPRx, sub0
+  void convertRegSequenceToVGPR(MachineInstr &MI, Register DstReg,
+                                const TargetRegisterClass *DstRC) const;
 
   /// Legalize all operands in this instruction.  This function may create new
   /// instructions and control-flow around \p MI.  If present, \p MDT is
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4287,6 +4287,26 @@
   if (DstRC == OpRC)
     return;
 
+  MachineInstr *Def = MRI.getVRegDef(OpReg);
+  if (Op.getParent()->isPHI() && !RI.isSGPRClass(DstRC) &&
+      RI.isSGPRClass(OpRC)) {
+    SmallVector<MachineInstr *, 4> CopiesToDelete;
+    while (Def && Def->isCopy() && MRI.hasOneUse(Def->getOperand(0).getReg())) {
+      CopiesToDelete.push_back(Def);
+      Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+    }
+    if (Def && Def->isRegSequence()) {
+      Register Dst = Def->getOperand(0).getReg();
+      if (MRI.hasOneUse(Dst)) {
+        convertRegSequenceToVGPR(*Def,
+                                 OpReg, DstRC);
+        for (auto &Copy : CopiesToDelete)
+          Copy->eraseFromParent();
+        return;
+      }
+    }
+  }
+
   Register DstReg = MRI.createVirtualRegister(DstRC);
   MachineInstr *Copy =
       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
@@ -4294,7 +4314,6 @@
   Op.setReg(DstReg);
   Op.setSubReg(0);
 
-  MachineInstr *Def = MRI.getVRegDef(OpReg);
   if (!Def)
     return;
 
@@ -4314,6 +4333,46 @@
     Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
 }
 
+void SIInstrInfo::convertRegSequenceToVGPR(MachineInstr &MI, Register DstReg,
+                                    const TargetRegisterClass *DstRC) const {
+  assert(MI.isRegSequence());
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  MRI.setRegClass(DstReg, DstRC);
+
+  MI.getOperand(0).setReg(DstReg);
+  bool IsAGPR = RI.hasAGPRs(DstRC);
+
+  for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+    Register SrcReg = MI.getOperand(I).getReg();
+    unsigned SrcSubReg = MI.getOperand(I).getSubReg();
+
+    const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+    SrcRC = RI.getSubRegClass(SrcRC, SrcSubReg);
+    const TargetRegisterClass *NewSrcRC = RI.getEquivalentVGPRClass(SrcRC);
+
+    Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
+
+    BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), get(AMDGPU::COPY),
+            TmpReg)
+        .add(MI.getOperand(I));
+
+    if (IsAGPR) {
+      const TargetRegisterClass *NewSrcRC = RI.getEquivalentAGPRClass(SrcRC);
+      Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+      unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
+        AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
+      BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), get(Opc),
+              TmpAReg)
+          .addReg(TmpReg, RegState::Kill);
+      TmpReg = TmpAReg;
+    }
+
+    MI.getOperand(I).setReg(TmpReg);
+  }
+}
+
 // Emit the actual waterfall loop, executing the wrapped instruction for each
 // unique value of \p Rsrc across all lanes. In the best case we execute 1
 // iteration, in the worst case we execute 64 (once per lane).