diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -67,6 +67,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Target/TargetMachine.h"
@@ -81,7 +82,6 @@
     cl::init(true));
 
 namespace {
-
 class SIFixSGPRCopies : public MachineFunctionPass {
   MachineDominatorTree *MDT;
 
@@ -94,7 +94,9 @@
 
   SIFixSGPRCopies() : MachineFunctionPass(ID) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+  void lowerVGPR2SGPRCopies(MachineFunction &MF);
+  bool lowerSpecialCase(MachineInstr &MI);
 
   MachineBasicBlock *processPHINode(MachineInstr &MI);
 
@@ -569,6 +571,9 @@
   TII = ST.getInstrInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
 
+  // Handle V2S copies separately, before the rest of the copy processing.
+  lowerVGPR2SGPRCopies(MF);
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
        ++BI) {
     MachineBasicBlock *MBB = &*BI;
@@ -640,42 +645,7 @@
         continue;
       }
 
-      if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
-        Register SrcReg = MI.getOperand(1).getReg();
-        if (!SrcReg.isVirtual()) {
-          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-          if (NewBB && NewBB != MBB) {
-            MBB = NewBB;
-            E = MBB->end();
-            BI = MachineFunction::iterator(MBB);
-            BE = MF.end();
-          }
-          assert((!NewBB || NewBB == I->getParent()) &&
-                 "moveToVALU did not return the right basic block");
-          break;
-        }
-
-        MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
-        unsigned SMovOp;
-        int64_t Imm;
-        // If we are just copying an immediate, we can replace the copy with
-        // s_mov_b32.
-        if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
-          MI.getOperand(1).ChangeToImmediate(Imm);
-          MI.addImplicitDefUseOperands(MF);
-          MI.setDesc(TII->get(SMovOp));
-          break;
-        }
-        MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-        if (NewBB && NewBB != MBB) {
-          MBB = NewBB;
-          E = MBB->end();
-          BI = MachineFunction::iterator(MBB);
-          BE = MF.end();
-        }
-        assert((!NewBB || NewBB == I->getParent()) &&
-               "moveToVALU did not return the right basic block");
-      } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
         tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
       }
 
@@ -916,3 +886,265 @@
   }
   return CreatedBB;
 }
+
+bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getParent();
+  const TargetRegisterClass *SrcRC, *DstRC;
+  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+
+  // Returning true means the caller needs no further processing of this copy.
+  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+    return true;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
+    TII->moveToVALU(MI, MDT);
+    return true;
+  }
+
+  unsigned SMovOp;
+  int64_t Imm;
+  // If we are just copying an immediate, we can replace the copy with
+  // s_mov_b32.
+  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp,
+                              Imm)) {
+    MI.getOperand(1).ChangeToImmediate(Imm);
+    MI.addImplicitDefUseOperands(*MBB->getParent());
+    MI.setDesc(TII->get(SMovOp));
+    return true;
+  }
+  return false;
+}
+
+class V2SCopyInfo {
+public:
+  // VGPR to SGPR copy being processed.
+  MachineInstr *Copy;
+  // All SALU instructions reachable from this copy in the SSA graph.
+  DenseSet<MachineInstr *> SChain;
+  // Number of SGPR to VGPR copies that are used to put the SALU computation
+  // results back to VALU.
+  unsigned NumSVCopies;
+
+  unsigned Score;
+  // Actual number of v_readfirstlane_b32 instructions that need to be
+  // inserted to keep the SChain SALU.
+  unsigned NumReadfirstlanes;
+  // Current score state, used to speed up the selection of V2SCopyInfos for
+  // processing.
+  bool IsVALU = false;
+  // Unique ID. Used as a map key to keep a stable processing order.
+  unsigned ID;
+  // Next unique ID to use when a new instance is created.
+  static unsigned NextID;
+
+  // Number of other VGPR to SGPR copies that contribute to the current
+  // copy's SChain.
+  unsigned SiblingPenalty = 0;
+  SetVector<unsigned> Siblings;
+  V2SCopyInfo() : Copy(nullptr), ID(0) {}
+  V2SCopyInfo(MachineInstr *C, unsigned Width)
+      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(++NextID) {}
+  void dump() {
+    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
+           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
+           << "\nScore: " << Score << "\n";
+  }
+};
+unsigned V2SCopyInfo::NextID = 0;
+
+void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
+
+  DenseMap<unsigned, V2SCopyInfo> Copies;
+  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
+
+  // Computes the score of a VGPR to SGPR copy and decides how the copy is
+  // lowered: kept SALU via v_readfirstlane_b32, or moved to VALU.
+  auto isVALU = [&](V2SCopyInfo *I) -> bool {
+    if (I->SChain.empty())
+      return true;
+    I->Siblings = SiblingPenalty[*std::max_element(
+        I->SChain.begin(), I->SChain.end(),
+        [&](MachineInstr *A, MachineInstr *B) -> bool {
+          return SiblingPenalty[A].size() < SiblingPenalty[B].size();
+        })];
+    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
+    SetVector<Register> SrcRegs;
+    for (auto J : I->Siblings) {
+      if (Copies.count(J)) {
+        MachineInstr *SiblingCopy = Copies[J].Copy;
+        if (SiblingCopy->isImplicitDef())
+          // The copy has already been moved to VALU.
+          continue;
+
+        SrcRegs.insert(SiblingCopy->getOperand(1).getReg());
+      }
+    }
+    I->SiblingPenalty = SrcRegs.size();
+
+    unsigned Penalty =
+        I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
+    unsigned Profit = I->SChain.size();
+    I->Score = Penalty > Profit ? 0 : Profit - Penalty;
+    I->IsVALU = I->Score < 3;
+    return I->IsVALU;
+  };
+
+  auto needProcessing = [](MachineInstr &MI) -> bool {
+    switch (MI.getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::WQM:
+    case AMDGPU::STRICT_WQM:
+    case AMDGPU::SOFT_WQM:
+    case AMDGPU::STRICT_WWM:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    MachineBasicBlock *MBB = &*BI;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+         ++I) {
+      MachineInstr &MI = *I;
+      if (!needProcessing(MI))
+        continue;
+      if (!lowerSpecialCase(MI)) {
+
+        // Compute the COPY width to pass it to the V2SCopyInfo constructor.
+        Register SrcReg = MI.getOperand(1).getReg();
+        const TargetRegisterClass *RC = TRI->getRegClassForReg(*MRI, SrcReg);
+
+        V2SCopyInfo In(&MI, TRI->getRegSizeInBits(*RC));
+
+        SmallVector<MachineInstr *, 8> Worklist;
+        // Needed because SSA is not a tree but a graph and may have forks
+        // and joins; we should not walk the same path twice.
+        SetVector<MachineInstr *> Visited;
+        Worklist.push_back(&MI);
+        while (!Worklist.empty()) {
+
+          MachineInstr *Inst = Worklist.pop_back_val();
+
+          // The analysis is per MBB for now.
+          if (Inst->isPHI() || Inst->getParent() != MI.getParent())
+            continue;
+
+          if (!Visited.insert(Inst))
+            continue;
+
+          // Copies and REG_SEQUENCE do not contribute to the final assembly,
+          // so skip them, but keep the SGPR to VGPR copy bookkeeping.
+          if (Inst->isCopy() || Inst->isRegSequence()) {
+            if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+              if (!Inst->isCopy() ||
+                  !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+                In.NumSVCopies++;
+                continue;
+              }
+            }
+          }
+
+          SiblingPenalty[Inst].insert(In.ID);
+
+          SmallVector<MachineInstr *, 4> Users;
+          if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+              (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+            auto I = Inst->getIterator();
+            while ((++I) != Inst->getParent()->end() &&
+                   !I->findRegisterDefOperand(AMDGPU::SCC)) {
+              if (I->readsRegister(AMDGPU::SCC))
+                Users.push_back(&*I);
+            }
+          } else if (Inst->getNumExplicitDefs() != 0) {
+            Register Reg = Inst->getOperand(0).getReg();
+            for (auto &U : MRI->use_instructions(Reg)) {
+              if (TRI->isSGPRReg(*MRI, Reg)) {
+                Users.push_back(&U);
+              }
+            }
+          }
+          for (auto U : Users) {
+            if (TII->isSALU(*U))
+              In.SChain.insert(U);
+            Worklist.push_back(U);
+          }
+        }
+        Copies[In.ID] = In;
+      }
+    }
+  }
+
+  SmallVector<unsigned, 8> Worklist;
+  for (auto &C : Copies) {
+    if (isVALU(&C.second))
+      Worklist.push_back(C.second.ID);
+  }
+
+  while (!Worklist.empty()) {
+    unsigned CurID = Worklist.pop_back_val();
+    if (Copies.count(CurID)) {
+      V2SCopyInfo C = Copies[CurID];
+      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
+      for (auto S : C.Siblings) {
+        if (Copies.count(S)) {
+          V2SCopyInfo &SI = Copies[S];
+          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
+          if (!SI.IsVALU) {
+            set_subtract(SI.SChain, C.SChain);
+            if (isVALU(&SI))
+              Worklist.push_back(SI.ID);
+          }
+          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
+        }
+      }
+      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
+                        << " is being turned to VALU\n");
+      Copies.erase(C.ID);
+      TII->moveToVALU(*C.Copy, MDT);
+    }
+  }
+
+  // Now do the actual lowering.
+  for (auto C : Copies) {
+    MachineInstr *MI = C.second.Copy;
+    MachineBasicBlock *MBB = MI->getParent();
+    // This copy is kept SALU: lower it to v_readfirstlane_b32, one per
+    // 32-bit sub-register of the source.
+    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
+                      << " is being turned to v_readfirstlane_b32, score: "
+                      << C.second.Score << "\n");
+    uint16_t SubRegs[4] = {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
+                           AMDGPU::sub3};
+    Register DstReg = MI->getOperand(0).getReg();
+    Register SrcReg = MI->getOperand(1).getReg();
+    unsigned SubReg = MI->getOperand(1).getSubReg();
+    bool IsSubReg = SubReg != AMDGPU::NoSubRegister;
+    const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
+    if (IsSubReg)
+      SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
+    if (TRI->getRegSizeInBits(*SrcRC) == 32) {
+      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+      if (IsSubReg)
+        MIB.addReg(SrcReg, 0, SubReg);
+      else
+        MIB.addReg(SrcReg);
+    } else {
+      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
+      for (int i = 0; i < N; i++) {
+        Register PartialSrc =
+            TII->buildExtractSubReg(Result, *MRI, MI->getOperand(1), SrcRC,
+                                    SubRegs[i], &AMDGPU::VGPR_32RegClass);
+        Register PartialDst =
+            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        BuildMI(*MBB, *Result, Result->getDebugLoc(),
+                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
+            .addReg(PartialSrc);
+        Result.addReg(PartialDst).addImm(SubRegs[i]);
+      }
+    }
+    MI->eraseFromParent();
+  }
+}
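
Note: the lowering above keeps a profitable V2S copy on the SALU side by reading the VGPR source one 32-bit lane at a time and reassembling the result with a REG_SEQUENCE. As a rough sketch of the resulting MIR for a 64-bit copy (register names and classes below are illustrative, not taken from an actual test):

  %vlo:vgpr_32 = COPY %vsrc.sub0
  %slo:sreg_32 = V_READFIRSTLANE_B32 %vlo
  %vhi:vgpr_32 = COPY %vsrc.sub1
  %shi:sreg_32 = V_READFIRSTLANE_B32 %vhi
  %sdst:sreg_64 = REG_SEQUENCE %slo, %subreg.sub0, %shi, %subreg.sub1

Copies whose score falls below the threshold are still rewritten by moveToVALU, as before.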