diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -118,8 +118,22 @@
 #endif
 };
 
-class SIFixSGPRCopies : public MachineFunctionPass {
+class SIFixSGPRCopies : public MachineFunctionPass, OnDeleteListener {
   MachineDominatorTree *MDT;
+  typedef struct tagMIT {
+    std::list<MachineInstr *>::iterator I;
+    std::list<MachineInstr *> *List;
+  } MITracking;
+  DenseMap<MachineInstr *, MITracking> PostProcessingLists;
+  void addToPostProcessing(std::list<MachineInstr *> &L, MachineInstr &MI) {
+    MITracking &T = PostProcessingLists[&MI];
+    T.List = &L;
+    T.I = L.insert(L.begin(), &MI);
+  }
+  std::list<MachineInstr *> SCCCopies;
+  std::list<MachineInstr *> RegSequences;
+  std::list<MachineInstr *> PHINodes;
+  std::list<MachineInstr *> S2VCopies;
   unsigned NextVGPRToSGPRCopyID;
   DenseMap<unsigned, V2SCopyInfo> V2SCopies;
   DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
@@ -134,8 +148,11 @@
   SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}
 
   bool runOnMachineFunction(MachineFunction &MF) override;
+  void fixSCCCopies(bool IsWave32);
+  void prepareRegSequenceAndPHIs(MachineFunction &MF);
   unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
-  void analyzeVGPRToSGPRCopy(V2SCopyInfo& Info);
+  bool needToBeConvertedToVALU(V2SCopyInfo *I);
+  void analyzeVGPRToSGPRCopy(MachineInstr *MI);
   void lowerVGPR2SGPRCopies(MachineFunction &MF);
   // Handles copies which source register is:
   // 1. Physical register
@@ -145,6 +162,8 @@
 
   void processPHINode(MachineInstr &MI);
 
+  void onInstrDelete(MachineInstr *MI) override;
+
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -171,19 +190,6 @@
   return new SIFixSGPRCopies();
 }
 
-static bool hasVectorOperands(const MachineInstr &MI,
-                              const SIRegisterInfo *TRI) {
-  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
-  for (const MachineOperand &MO : MI.operands()) {
-    if (!MO.isReg() || !MO.getReg().isVirtual())
-      continue;
-
-    if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg())))
-      return true;
-  }
-  return false;
-}
-
 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
 getCopyRegClasses(const MachineInstr &Copy,
                   const SIRegisterInfo &TRI,
@@ -616,13 +622,6 @@
   TII = ST.getInstrInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
 
-  // We have to lower VGPR to SGPR copies before the main loop
-  // because the REG_SEQUENCE and PHI lowering in main loop
-  // convert the def-use chains to VALU and close the opportunities
-  // for keeping them scalar.
-  // TODO: REG_SEQENCE and PHIs are semantically copies. The next patch
-  // addresses their lowering and unify the processing in one main loop.
-  lowerVGPR2SGPRCopies(MF);
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
        ++BI) {
@@ -639,100 +638,68 @@
       case AMDGPU::STRICT_WQM:
       case AMDGPU::SOFT_WQM:
       case AMDGPU::STRICT_WWM: {
+        Register SrcReg = MI.getOperand(1).getReg();
         Register DstReg = MI.getOperand(0).getReg();
         const TargetRegisterClass *SrcRC, *DstRC;
         std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
 
-        if (MI.isCopy()) {
-          Register SrcReg = MI.getOperand(1).getReg();
-          if (SrcReg == AMDGPU::SCC) {
-            Register SCCCopy = MRI->createVirtualRegister(
-                TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
-            I = BuildMI(*MI.getParent(),
-                        std::next(MachineBasicBlock::iterator(MI)),
-                        MI.getDebugLoc(),
-                        TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
-                                               : AMDGPU::S_CSELECT_B64),
-                        SCCCopy)
-                    .addImm(-1)
-                    .addImm(0);
-            I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(),
-                        TII->get(AMDGPU::COPY), DstReg)
-                    .addReg(SCCCopy);
-            MI.eraseFromParent();
-            continue;
-          } else if (DstReg == AMDGPU::SCC) {
-            unsigned Opcode =
-                ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
-            Register Exec = ST.isWave64() ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
-            Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
-            I = BuildMI(*MI.getParent(),
-                        std::next(MachineBasicBlock::iterator(MI)),
-                        MI.getDebugLoc(), TII->get(Opcode))
-                    .addReg(Tmp, getDefRegState(true))
-                    .addReg(SrcReg)
-                    .addReg(Exec);
-            MI.eraseFromParent();
+        if (MI.isCopy() && (SrcReg == AMDGPU::SCC || DstReg == AMDGPU::SCC))
+          addToPostProcessing(SCCCopies, MI);
+
+        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+          // Since SGPR to VGPR copies contribute to the VGPR to SGPR copy
+          // score and, hence, to the lowering decision, try to get rid of
+          // them as early as possible.
+          if (tryChangeVGPRtoSGPRinCopy(MI, TRI, TII))
             continue;
-          }
-        }
-        if (!DstReg.isVirtual()) {
-          // If the destination register is a physical register there isn't
-          // really much we can do to fix this.
-          // Some special instructions use M0 as an input. Some even only use
-          // the first lane. Insert a readfirstlane and hope for the best.
-          if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
-            Register TmpReg
-              = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-
-            BuildMI(*MBB, MI, MI.getDebugLoc(),
-                    TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
-                .add(MI.getOperand(1));
-            MI.getOperand(1).setReg(TmpReg);
-          }
+          // Collect the copies that were not changed, to retry them after
+          // VGPR to SGPR copy lowering when there are more opportunities.
+          addToPostProcessing(S2VCopies, MI);
+        }
+        if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+          continue;
+        if (lowerSpecialCase(MI))
           continue;
-        }
-        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
-          tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
-        }
+
+        analyzeVGPRToSGPRCopy(&MI);
 
         break;
       }
-      case AMDGPU::PHI: {
-        processPHINode(MI);
-        break;
-      }
+      case AMDGPU::INSERT_SUBREG:
+      case AMDGPU::PHI:
       case AMDGPU::REG_SEQUENCE: {
-        if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
-            !hasVectorOperands(MI, TRI)) {
-          foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
-          continue;
-        }
-
-        break;
-      }
-      case AMDGPU::INSERT_SUBREG: {
-        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
-        DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
-        Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
-        Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
-        if (TRI->isSGPRClass(DstRC) &&
-            (TRI->hasVectorRegisters(Src0RC) ||
-             TRI->hasVectorRegisters(Src1RC))) {
-          LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI);
-          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-          if (NewBB && NewBB != MBB) {
-            MBB = NewBB;
-            E = MBB->end();
-            BI = MachineFunction::iterator(MBB);
-            BE = MF.end();
+        if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) {
+          for (MachineOperand &MO : MI.operands()) {
+            if (!MO.isReg() || !MO.getReg().isVirtual())
+              continue;
+            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
+            if (TRI->hasVectorRegisters(SrcRC)) {
+              const TargetRegisterClass *DestRC =
+                  TRI->getEquivalentSGPRClass(SrcRC);
+              Register NewDst = MRI->createVirtualRegister(DestRC);
+              MachineBasicBlock *BlockToInsertCopy =
+                  MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB()
+                             : MBB;
+              MachineBasicBlock::iterator PointToInsertCopy =
+                  MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator()
+                             : I;
+              MachineInstr *NewCopy =
+                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
+                          PointToInsertCopy->getDebugLoc(),
+                          TII->get(AMDGPU::COPY), NewDst)
+                      .addReg(MO.getReg());
+              MO.setReg(NewDst);
+              analyzeVGPRToSGPRCopy(NewCopy);
+            }
+          }
-          assert((!NewBB || NewBB == I->getParent()) &&
-                 "moveToVALU did not return the right basic block");
         }
+
+        if (MI.isPHI())
+          addToPostProcessing(PHINodes, MI);
+        else if (MI.isRegSequence())
+          addToPostProcessing(RegSequences, MI);
 
         break;
       }
       case AMDGPU::V_WRITELANE_B32: {
@@ -800,11 +767,35 @@
     }
   }
 
+  lowerVGPR2SGPRCopies(MF);
+  // Postprocessing
+  fixSCCCopies(ST.isWave32());
+  for (auto MI : S2VCopies) {
+    // Check if it is still valid
+    if (MI->isCopy()) {
+      const TargetRegisterClass *SrcRC, *DstRC;
+      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
+      if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI))
+        tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII);
+    }
+  }
+  for (auto MI : RegSequences) {
+    // Check if it is still valid
+    if (MI->isRegSequence())
+      foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI);
+  }
+  for (auto MI : PHINodes) {
+    processPHINode(*MI);
+  }
   if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
     hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
 
   SiblingPenalty.clear();
   V2SCopies.clear();
+  SCCCopies.clear();
+  RegSequences.clear();
+  PHINodes.clear();
+  S2VCopies.clear();
 
   return true;
 }
 
@@ -861,9 +852,26 @@
 }
 
 bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
+  Register DstReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(1).getReg();
+  if (!DstReg.isVirtual()) {
+    // If the destination register is a physical register there isn't
+    // really much we can do to fix this.
+    // Some special instructions use M0 as an input. Some even only use
+    // the first lane. Insert a readfirstlane and hope for the best.
+    if (DstReg == AMDGPU::M0 &&
+        TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) {
+      Register TmpReg =
+          MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+              TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
+          .add(MI.getOperand(1));
+      MI.getOperand(1).setReg(TmpReg);
+    }
+    return true;
+  }
   if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
-    TII->moveToVALU(MI, MDT);
+    TII->moveToVALU(MI, MDT, this);
     return true;
   }
 
@@ -880,9 +888,12 @@
   return false;
 }
 
-
-void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(V2SCopyInfo& Info) {
+void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr *MI) {
+  Register DstReg = MI->getOperand(0).getReg();
+  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+
+  V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
+                   TRI->getRegSizeInBits(*DstRC));
   SmallVector<MachineInstr *, 8> AnalysisWorklist;
   // Needed because the SSA is not a tree but a graph and may have
   // forks and joins. We should not then go same way twice.
@@ -924,146 +935,57 @@
       for (auto &U : MRI->use_instructions(Reg))
         Users.push_back(&U);
     }
-    for (auto *U : Users) {
+    for (auto U : Users) {
       if (TII->isSALU(*U))
         Info.SChain.insert(U);
       AnalysisWorklist.push_back(U);
     }
   }
+  V2SCopies[Info.ID] = Info;
 }
 
-void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
-
-  // The main function that computes the VGPR to SGPR copy score
-  // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
-  auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
-    if (I->SChain.empty()) {
-      I->Score = 0;
-      return true;
-    }
-    I->Siblings = SiblingPenalty[*std::max_element(
-        I->SChain.begin(), I->SChain.end(),
-        [&](MachineInstr *A, MachineInstr *B) -> bool {
-          return SiblingPenalty[A].size() < SiblingPenalty[B].size();
-        })];
-    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
-    // The loop below computes the number of another VGPR to SGPR V2SCopies
-    // which contribute to the current copy SALU chain. We assume that all the
-    // V2SCopies with the same source virtual register will be squashed to one
-    // by regalloc. Also we take care of the V2SCopies of the differnt subregs
-    // of the same register.
-    SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
-    for (auto J : I->Siblings) {
-      auto InfoIt = V2SCopies.find(J);
-      if (InfoIt != V2SCopies.end()) {
-        MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
-        if (SiblingCopy->isImplicitDef())
-          // the COPY has already been MoveToVALUed
-          continue;
-
-        SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
-                                      SiblingCopy->getOperand(1).getSubReg()));
-      }
-    }
-    I->SiblingPenalty = SrcRegs.size();
-
-    unsigned Penalty =
-        I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
-    unsigned Profit = I->SChain.size();
-    I->Score = Penalty > Profit ? 0 : Profit - Penalty;
-    I->NeedToBeConvertedToVALU = I->Score < 3;
-    return I->NeedToBeConvertedToVALU;
-  };
-
-  auto needProcessing = [](MachineInstr &MI) -> bool {
-    switch (MI.getOpcode()) {
-    case AMDGPU::COPY:
-    case AMDGPU::WQM:
-    case AMDGPU::STRICT_WQM:
-    case AMDGPU::SOFT_WQM:
-    case AMDGPU::STRICT_WWM:
-    case AMDGPU::REG_SEQUENCE:
-    case AMDGPU::PHI:
-      return true;
-    default:
-      return false;
-    }
-  };
-
-  SmallSet<MachineInstr *, 4> OutOfOrderProcessedCopies;
-
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
-         ++I) {
-      MachineInstr *MI = &*I;
-      if (!needProcessing(*MI))
-        continue;
-
-      if (MI->isRegSequence() || MI->isPHI()) {
-        MachineBasicBlock::iterator J = I;
-        if (TRI->isSGPRClass(TII->getOpRegClass(*MI, 0))) {
-          for (MachineOperand &MO : MI->operands()) {
-            if (!MO.isReg() || !MO.getReg().isVirtual())
-              continue;
-            const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg());
-            if (TRI->hasVectorRegisters(SrcRC)) {
-              const TargetRegisterClass *DestRC =
-                  TRI->getEquivalentSGPRClass(SrcRC);
-              Register NewDst = MRI->createVirtualRegister(DestRC);
-              MachineBasicBlock *BlockToInsertCopy = &MBB;
-              MachineBasicBlock::iterator PointToInsertCopy = I;
-              if (MI->isPHI()) {
-                BlockToInsertCopy =
-                    MI->getOperand(MI->getOperandNo(&MO) + 1).getMBB();
-                PointToInsertCopy =
-                    BlockToInsertCopy->getFirstInstrTerminator();
-              }
-              MachineBasicBlock::iterator NewI =
-                  BuildMI(*BlockToInsertCopy, PointToInsertCopy,
-                          PointToInsertCopy->getDebugLoc(),
-                          TII->get(AMDGPU::COPY), NewDst)
-                      .addReg(MO.getReg());
-              MO.setReg(NewDst);
-              if (!MI->isPHI()) {
-                I = NewI;
-                MI = &*I;
-              } else {
-                // We insert the copy into the basic block that may have been
-                // already processed. Pass it to the analysis explicitly.
-                V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
-                               TRI->getRegSizeInBits(*DestRC));
-                analyzeVGPRToSGPRCopy(In);
-                V2SCopies[In.ID] = In;
-                OutOfOrderProcessedCopies.insert(MI);
-              }
-            }
-          }
-        }
-
-        if (J == I)
-          continue;
-      }
-
-      const TargetRegisterClass *SrcRC, *DstRC;
-      std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI);
-
-      if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
-        continue;
-
-      if (lowerSpecialCase(*MI))
-        continue;
-
-      if (OutOfOrderProcessedCopies.contains(MI))
+// The main function that computes the VGPR to SGPR copy score and decides
+// how the copy should be lowered: v_readfirstlane_b32 or moveToVALU.
+bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
+  if (Info->SChain.empty()) {
+    Info->Score = 0;
+    return true;
+  }
+  Info->Siblings = SiblingPenalty[*std::max_element(
+      Info->SChain.begin(), Info->SChain.end(),
+      [&](MachineInstr *A, MachineInstr *B) -> bool {
+        return SiblingPenalty[A].size() < SiblingPenalty[B].size();
+      })];
+  Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; });
+  // The loop below computes the number of other VGPR to SGPR copies
+  // which contribute to the current copy SALU chain. We assume that all the
+  // V2SCopies with the same source virtual register will be squashed to one
+  // by regalloc. Also we take care of the V2SCopies of the different subregs
+  // of the same register.
+  SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
+  for (auto J : Info->Siblings) {
+    auto InfoIt = V2SCopies.find(J);
+    if (InfoIt != V2SCopies.end()) {
+      MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+      if (SiblingCopy->isImplicitDef())
+        // The COPY has already been lowered by moveToVALU.
        continue;
-      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI,
-                     TRI->getRegSizeInBits(*DstRC));
-
-      analyzeVGPRToSGPRCopy(In);
-
-      V2SCopies[In.ID] = In;
+      SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
+                                    SiblingCopy->getOperand(1).getSubReg()));
     }
   }
+  Info->SiblingPenalty = SrcRegs.size();
+
+  unsigned Penalty =
+      Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes;
+  unsigned Profit = Info->SChain.size();
+  Info->Score = Penalty > Profit ? 0 : Profit - Penalty;
+  Info->NeedToBeConvertedToVALU = Info->Score < 3;
+  return Info->NeedToBeConvertedToVALU;
+}
+
+void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
 
   SmallVector<unsigned, 8> LoweringWorklist;
   for (auto &C : V2SCopies) {
@@ -1093,7 +1015,7 @@
     LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                       << " is being turned to VALU\n");
     V2SCopies.erase(C.ID);
-    TII->moveToVALU(*C.Copy, MDT);
+    TII->moveToVALU(*C.Copy, MDT, this);
   }
 }
 
@@ -1140,3 +1062,50 @@
     MI->eraseFromParent();
   }
 }
+
+void SIFixSGPRCopies::fixSCCCopies(bool IsWave32) {
+  for (auto MI : SCCCopies) {
+    // May already have been lowered.
+    if (!MI->isCopy())
+      continue;
+    Register SrcReg = MI->getOperand(1).getReg();
+    Register DstReg = MI->getOperand(0).getReg();
+    if (SrcReg == AMDGPU::SCC) {
+      Register SCCCopy = MRI->createVirtualRegister(
+          TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
+      MachineBasicBlock::iterator I =
+          BuildMI(*MI->getParent(),
+                  std::next(MachineBasicBlock::iterator(MI)),
+                  MI->getDebugLoc(),
+                  TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32
+                                    : AMDGPU::S_CSELECT_B64),
+                  SCCCopy)
+              .addImm(-1)
+              .addImm(0);
+      BuildMI(*MI->getParent(), std::next(I), I->getDebugLoc(),
+              TII->get(AMDGPU::COPY), DstReg)
+          .addReg(SCCCopy);
+      MI->eraseFromParent();
+      continue;
+    }
+    if (DstReg == AMDGPU::SCC) {
+      unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+      Register Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+      Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC());
+      BuildMI(*MI->getParent(), std::next(MachineBasicBlock::iterator(MI)),
+              MI->getDebugLoc(), TII->get(Opcode))
+          .addReg(Tmp, getDefRegState(true))
+          .addReg(SrcReg)
+          .addReg(Exec);
+      MI->eraseFromParent();
+    }
+  }
+}
+
+void SIFixSGPRCopies::onInstrDelete(MachineInstr *MI) {
+  auto T = PostProcessingLists.find(MI);
+  if (T != PostProcessingLists.end()) {
+    T->second.List->erase(T->second.I);
+    PostProcessingLists.erase(MI);
+  }
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -36,6 +36,12 @@
 class TargetRegisterClass;
 class ScheduleHazardRecognizer;
 
+class OnDeleteListener {
+public:
+  virtual ~OnDeleteListener() = default;
+  virtual void onInstrDelete(MachineInstr *) = 0;
+};
+
 /// Mark the MMO of a uniform load if there are no potentially clobbering stores
 /// on any path from the start of an entry function to this load.
 static const MachineMemOperand::Flags MONoClobber =
@@ -128,8 +133,8 @@
   void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                     MachineInstr &SCCDefInst,
-                                    SetVectorType &Worklist,
-                                    Register NewCond = Register()) const;
+                                    SetVectorType &Worklist, Register NewCond,
+                                    OnDeleteListener *OnDelete = nullptr) const;
   void addSCCDefsToVALUWorklist(MachineOperand &Op,
                                 SetVectorType &Worklist) const;
@@ -1004,8 +1009,8 @@
   /// Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
   /// VALU if necessary. If present, \p MDT is updated.
-  MachineBasicBlock *moveToVALU(MachineInstr &MI,
-                                MachineDominatorTree *MDT = nullptr) const;
+  MachineBasicBlock *moveToVALU(MachineInstr &MI, MachineDominatorTree *MDT,
+                                OnDeleteListener *OnDelete = nullptr) const;
 
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const override;
@@ -1181,6 +1186,15 @@
   // This is used if an operand is a 32 bit register but needs to be aligned
   // regardless.
   void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
+
+  /// Removes \p MI from its parent block, first notifying \p OnDelete
+  /// (if any) that the instruction is about to be deleted.
+  void removeInstruction(MachineInstr *MI,
+                         OnDeleteListener *OnDelete = nullptr) const {
+    if (OnDelete)
+      OnDelete->onInstrDelete(MI);
+    MI->eraseFromParent();
+  }
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class
@@ -1225,7 +1239,6 @@
 bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                    Register VReg,
                                    const MachineInstr &DefMI);
-
 namespace AMDGPU {
 
 LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1902,7 +1902,7 @@
           .addReg(Dst, RegState::Implicit | RegState::Define);
       }
     }
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
@@ -1930,7 +1930,7 @@
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
         .addImm(Hi.getSExtValue())
         .addReg(Dst, RegState::Implicit | RegState::Define);
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::V_SET_INACTIVE_B32: {
@@ -1946,7 +1946,7 @@
         .add(MI.getOperand(2));
     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
         .addReg(Exec);
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::V_SET_INACTIVE_B64: {
@@ -1964,7 +1964,7 @@
     expandPostRAPseudo(*Copy);
     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
         .addReg(Exec);
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
@@ -2015,7 +2015,7 @@
         OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
     const int ImpUseIdx = ImpDefIdx + 1;
     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
@@ -2054,7 +2054,7 @@
     finalizeBundle(MBB, SetOn->getIterator(),
                    std::next(SetOff->getIterator()));
 
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
@@ -2086,7 +2086,7 @@
     finalizeBundle(MBB, SetOn->getIterator(),
                    std::next(SetOff->getIterator()));
 
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
@@ -2113,7 +2113,7 @@
     Bundler.append(MIB);
     finalizeBundle(MBB, Bundler.begin());
 
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::ENTER_STRICT_WWM: {
@@ -2132,7 +2132,7 @@
     BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
     BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
 
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   case AMDGPU::EXIT_STRICT_WWM:
@@ -2156,7 +2156,7 @@
             .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
     MIB.copyImplicitOps(MI);
 
-    MI.eraseFromParent();
+    removeInstruction(&MI);
     break;
   }
   }
@@ -2223,7 +2223,7 @@
       .addReg(Split[1]->getOperand(0).getReg())
       .addImm(AMDGPU::sub1);
 
-  MI.eraseFromParent();
+  removeInstruction(&MI);
   return std::make_pair(Split[0], Split[1]);
 }
 
@@ -2624,7 +2624,7 @@
     // Skip over artificial terminators when removing instructions.
     if (MI.isBranch() || MI.isReturn()) {
       RemovedSize += getInstSizeInBytes(MI);
-      MI.eraseFromParent();
+      removeInstruction(&MI);
       ++Count;
     }
   }
@@ -3025,7 +3025,7 @@
   bool DeleteDef = MRI->use_nodbg_empty(Reg);
   if (DeleteDef)
-    DefMI.eraseFromParent();
+    removeInstruction(&DefMI);
 
   return true;
 }
@@ -3102,7 +3102,7 @@
   bool DeleteDef = MRI->use_nodbg_empty(Reg);
   if (DeleteDef)
-    DefMI.eraseFromParent();
+    removeInstruction(&DefMI);
 
   return true;
 }
@@ -5396,7 +5396,7 @@
   }
 
   if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
-    VAddrDef->eraseFromParent();
+    removeInstruction(VAddrDef);
 
   return true;
 }
@@ -6030,8 +6030,9 @@
   return CreatedBB;
 }
 
-MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
-                                           MachineDominatorTree *MDT) const {
+MachineBasicBlock *SIInstrInfo::moveToVALU(
+    MachineInstr &TopInst,
+    MachineDominatorTree *MDT, OnDeleteListener *OnDelete) const {
   SetVectorType Worklist;
   Worklist.insert(&TopInst);
   MachineBasicBlock *CreatedBB = nullptr;
@@ -6052,7 +6053,7 @@
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO:
     splitScalar64BitAddSub(Worklist, Inst, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
   case AMDGPU::S_ADD_I32:
   case AMDGPU::S_SUB_I32: {
@@ -6069,27 +6070,27 @@
   }
 
   case AMDGPU::S_AND_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_OR_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_XOR_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_NAND_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_NOR_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_XNOR_B64:
@@ -6097,37 +6098,37 @@
       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
     else
       splitScalar64BitXnor(Worklist, Inst, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_ANDN2_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_ORN2_B64:
     splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_BREV_B64:
     splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_NOT_B64:
     splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_BCNT1_I32_B64:
     splitScalar64BitBCNT(Worklist, Inst);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_BFE_I64:
     splitScalar64BitBFE(Worklist, Inst);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_LSHL_B32:
@@ -6169,7 +6170,7 @@
   case AMDGPU::S_ABS_I32:
     lowerScalarAbs(Worklist, Inst);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_CBRANCH_SCC0:
@@ -6196,32 +6197,32 @@
   case AMDGPU::S_PACK_HL_B32_B16:
   case AMDGPU::S_PACK_HH_B32_B16:
     movePackToVALU(Worklist, MRI, Inst);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_XNOR_B32:
     lowerScalarXnor(Worklist, Inst);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
    continue;
 
   case AMDGPU::S_NAND_B32:
     splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_NOR_B32:
     splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_ANDN2_B32:
     splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_ORN2_B32:
     splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   // TODO: remove as soon as everything is ready
@@ -6258,7 +6259,7 @@
       CreatedBB = CreatedBBTmp;
       MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
       addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
-      Inst.eraseFromParent();
+      removeInstruction(&Inst, OnDelete);
     }
     continue;
   case AMDGPU::S_UADDO_PSEUDO:
@@ -6288,14 +6289,14 @@
       MRI.replaceRegWith(Dest0.getReg(), DestReg);
       addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
                                    Worklist);
-      Inst.eraseFromParent();
+      removeInstruction(&Inst, OnDelete);
     }
     continue;
 
   case AMDGPU::S_CSELECT_B32:
   case AMDGPU::S_CSELECT_B64:
     lowerSelect(Worklist, Inst, MDT);
-    Inst.eraseFromParent();
+    removeInstruction(&Inst, OnDelete);
     continue;
 
   case AMDGPU::S_CMP_EQ_I32:
   case AMDGPU::S_CMP_LG_I32:
@@ -6320,8 +6321,8 @@
     legalizeOperands(*NewInstr, MDT);
     int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
     MachineOperand SCCOp = Inst.getOperand(SCCIdx);
-    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
-    Inst.eraseFromParent();
+    addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg, OnDelete);
+    removeInstruction(&Inst, OnDelete);
   }
     continue;
   }
@@ -6348,7 +6349,8 @@
       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
         // Only propagate through live-def of SCC.
         if (Op.isDef() && !Op.isDead())
-          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
+          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist, Register(),
+                                       OnDelete);
         if (Op.isUse())
           addSCCDefsToVALUWorklist(Op, Worklist);
         Inst.removeOperand(i);
@@ -7141,10 +7143,9 @@
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
-void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
-                                               MachineInstr &SCCDefInst,
-                                               SetVectorType &Worklist,
-                                               Register NewCond) const {
+void SIInstrInfo::addSCCDefUsersToVALUWorklist(
+    MachineOperand &Op, MachineInstr &SCCDefInst, SetVectorType &Worklist,
+    Register NewCond, OnDeleteListener *OnDelete) const {
   // Ensure that def inst defines SCC, which is still live.
   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
@@ -7177,7 +7178,7 @@
       break;
   }
   for (auto &Copy : CopyToDelete)
-    Copy->eraseFromParent();
+    removeInstruction(Copy, OnDelete);
 }
 
 // Instructions that use SCC may be converted to VALU instructions. When that
@@ -8396,7 +8397,7 @@
   MachineOperand *SccDef = Def->findRegisterDefOperand(AMDGPU::SCC);
   SccDef->setIsDead(false);
-  CmpInstr.eraseFromParent();
+  removeInstruction(&CmpInstr);
 
   if (!MRI->use_nodbg_empty(DefReg)) {
     assert(!IsReversedCC);
@@ -8414,7 +8415,7 @@
     BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
         .add(*SrcOp)
         .addImm(BitNo);
-    Def->eraseFromParent();
+    removeInstruction(Def);
 
     return true;
   };
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -41,9 +41,9 @@
   ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; W64-NEXT: {{ $}}
   ; W64-NEXT: .1:
@@ -88,9 +88,9 @@
   ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; W32-NEXT: {{ $}}
   ; W32-NEXT: .1:
@@ -160,10 +160,10 @@
   ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
-  ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
-  ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
+  ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+  ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+  ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; W64-NEXT: {{ $}}
   ; W64-NEXT: .1:
@@ -207,10 +207,10 @@
   ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
-  ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
-  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
+  ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
+  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; W32-NEXT: {{ $}}
   ; W32-NEXT: .1:
@@ -280,10 +280,10 @@
   ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
-  ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
-  ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+  ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+  ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
   ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; W64-NEXT: {{ $}}
   ; W64-NEXT: .1:
@@ -327,10 +327,10 @@
   ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
-  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
-  ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
+  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
+  ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3
   ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; W32-NEXT: {{ $}}
   ; W32-NEXT: .1:
@@ -400,9 +400,9 @@
   ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
@@ -429,9 +429,9 @@
   ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
@@ -485,9 +485,9 @@
   ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1
   ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
@@ -513,9 +513,9 @@
   ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec
   ; W64-NO-ADDR64-NEXT: {{ $}}
   ; W64-NO-ADDR64-NEXT: .1:
@@ -560,9 +560,9 @@
   ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
   ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec
-  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec
+  ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec
   ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec
-  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
+  ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3
   ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo
   ; W32-NEXT: {{ $}}
   ; W32-NEXT: .1:
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -140,8 +140,8 @@
   ; SI-NEXT: bb.2.Flow:
   ; SI-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; SI-NEXT: {{ $}}
-  ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %10, %bb.4
-  ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %9, %bb.4
+  ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.1, %10, %bb.4
+  ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4
   ; SI-NEXT: [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %35:vgpr_32, %bb.4
   ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT: S_BRANCH %bb.3
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -434,7 +434,7 @@
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34
-; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr34_sgpr35
@@ -586,7 +586,7 @@
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr40
-; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v4, v8
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v9
 ; GFX9-O0-NEXT:    v_add_co_u32_e64 v2, s[40:41], v2, v4
@@ -723,7 +723,7 @@
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v10
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX9-O0-NEXT:    s_mov_b32 s35, 0x7fffffff
@@ -742,7 +742,7 @@
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v12
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
-; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v7
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v1, v5
@@ -771,7 +771,7 @@
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr35
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $exec
-; GFX9-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $exec
+; GFX9-O0-NEXT:    ; kill: def $vgpr11 killed $vgpr11 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v11