Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -533,6 +533,80 @@
     .addReg(SrcReg, getKillRegState(KillSrc));
 }
 
+/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
+/// to directly copy, so an intermediate VGPR needs to be used.
+static void indirectCopyToAGPR(const SIInstrInfo &TII,
+                               MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI,
+                               const DebugLoc &DL, MCRegister DestReg,
+                               MCRegister SrcReg, bool KillSrc,
+                               RegScavenger &RS) {
+  const SIRegisterInfo &RI = TII.getRegisterInfo();
+
+  assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
+         AMDGPU::AGPR_32RegClass.contains(SrcReg));
+
+  // First try to find defining accvgpr_write to avoid temporary registers.
+  for (auto Def = MI, E = MBB.begin(); Def != E; ) {
+    --Def;
+    if (!Def->definesRegister(SrcReg, &RI))
+      continue;
+    if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
+      break;
+
+    MachineOperand &DefOp = Def->getOperand(1);
+    assert(DefOp.isReg() || DefOp.isImm());
+
+    if (DefOp.isReg()) {
+      // Check that register source operand if not clobbered before MI.
+      // Immediate operands are always safe to propagate.
+      bool SafeToPropagate = true;
+      for (auto I = Def; I != MI && SafeToPropagate; ++I)
+        if (I->modifiesRegister(DefOp.getReg(), &RI))
+          SafeToPropagate = false;
+
+      if (!SafeToPropagate)
+        break;
+
+      DefOp.setIsKill(false);
+    }
+
+    BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+      .add(DefOp);
+    return;
+  }
+
+  RS.enterBasicBlock(MBB);
+  RS.forward(MI);
+
+  // Ideally we want to have three registers for a long reg_sequence copy
+  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
+  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
+                                             *MBB.getParent());
+
+  // Registers in the sequence are allocated contiguously so we can just
+  // use register number to pick one of three round-robin temps.
+  unsigned RegNo = DestReg % 3;
+  Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+  if (!Tmp)
+    report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
+  RS.setRegUsed(Tmp);
+  // Only loop through if there are any free registers left, otherwise
+  // scavenger may report a fatal error without emergency spill slot
+  // or spill with the slot.
+  while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
+    Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+    if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
+      break;
+    Tmp = Tmp2;
+    RS.setRegUsed(Tmp);
+  }
+
+  TII.copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
+  BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
+    .addReg(Tmp, RegState::Kill);
+}
+
 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
@@ -652,75 +726,18 @@
     return;
   }
 
-  if (RC == &AMDGPU::AGPR_32RegClass) {
-    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
-           AMDGPU::SReg_32RegClass.contains(SrcReg) ||
-           AMDGPU::AGPR_32RegClass.contains(SrcReg));
-    if (!AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
-      // First try to find defining accvgpr_write to avoid temporary registers.
-      for (auto Def = MI, E = MBB.begin(); Def != E; ) {
-        --Def;
-        if (!Def->definesRegister(SrcReg, &RI))
-          continue;
-        if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
-          break;
-
-        MachineOperand &DefOp = Def->getOperand(1);
-        assert(DefOp.isReg() || DefOp.isImm());
-
-        if (DefOp.isReg()) {
-          // Check that register source operand if not clobbered before MI.
-          // Immediate operands are always safe to propagate.
-          bool SafeToPropagate = true;
-          for (auto I = Def; I != MI && SafeToPropagate; ++I)
-            if (I->modifiesRegister(DefOp.getReg(), &RI))
-              SafeToPropagate = false;
-
-          if (!SafeToPropagate)
-            break;
-          DefOp.setIsKill(false);
-        }
-
-        BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
-          .add(DefOp);
-        return;
-      }
-
-      RegScavenger RS;
-      RS.enterBasicBlock(MBB);
-      RS.forward(MI);
-
-      // Ideally we want to have three registers for a long reg_sequence copy
-      // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
-      unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
-                                                 *MBB.getParent());
-
-      // Registers in the sequence are allocated contiguously so we can just
-      // use register number to pick one of three round-robin temps.
-      unsigned RegNo = DestReg % 3;
-      Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
-      if (!Tmp)
-        report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
-      RS.setRegUsed(Tmp);
-      // Only loop through if there are any free registers left, otherwise
-      // scavenger may report a fatal error without emergency spill slot
-      // or spill with the slot.
-      while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
-        unsigned Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
-        if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
-          break;
-        Tmp = Tmp2;
-        RS.setRegUsed(Tmp);
-      }
-      copyPhysReg(MBB, MI, DL, Tmp, SrcReg, KillSrc);
+  if (RC == &AMDGPU::AGPR_32RegClass) {
+    if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
-        .addReg(Tmp, RegState::Kill);
+        .addReg(SrcReg, getKillRegState(KillSrc));
       return;
     }
 
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32), DestReg)
-      .addReg(SrcReg, getKillRegState(KillSrc));
+    // FIXME: Pass should maintain scavenger to avoid scan through the block on
+    // every AGPR spill.
+    RegScavenger RS;
+    indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
     return;
   }