diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -120,6 +120,10 @@ class SIFixSGPRCopies : public MachineFunctionPass { MachineDominatorTree *MDT; + SmallVector<MachineInstr *, 4> SCCCopies; + SmallVector<MachineInstr *, 4> RegSequences; + SmallVector<MachineInstr *, 4> PHINodes; + SmallVector<MachineInstr *, 4> S2VCopies; unsigned NextVGPRToSGPRCopyID; DenseMap<unsigned, V2SCopyInfo> V2SCopies; DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty; @@ -134,8 +138,11 @@ SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {} bool runOnMachineFunction(MachineFunction &MF) override; + void fixSCCCopies(bool IsWave32); + void prepareRegSequenceAndPHIs(MachineFunction &MF); unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; } - void analyzeVGPRToSGPRCopy(V2SCopyInfo& Info); + bool needToBeConvertedToVALU(V2SCopyInfo *I); + void analyzeVGPRToSGPRCopy(MachineInstr *MI); void lowerVGPR2SGPRCopies(MachineFunction &MF); // Handles copies which source register is: // 1. Physical register @@ -171,19 +178,6 @@ return new SIFixSGPRCopies(); } -static bool hasVectorOperands(const MachineInstr &MI, - const SIRegisterInfo *TRI) { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.getReg().isVirtual()) - continue; - - if (TRI->hasVectorRegisters(MRI.getRegClass(MO.getReg()))) - return true; - } - return false; -} - static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> getCopyRegClasses(const MachineInstr &Copy, const SIRegisterInfo &TRI, @@ -616,14 +610,6 @@ TII = ST.getInstrInfo(); MDT = &getAnalysis<MachineDominatorTree>(); - // We have to lower VGPR to SGPR copies before the main loop - // because the REG_SEQUENCE and PHI lowering in main loop - // convert the def-use chains to VALU and close the opportunities - // for keeping them scalar. - // TODO: REG_SEQENCE and PHIs are semantically copies. The next patch - // addresses their lowering and unify the processing in one main loop. - lowerVGPR2SGPRCopies(MF); - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock *MBB = &*BI; @@ -639,100 +625,64 @@ case AMDGPU::STRICT_WQM: case AMDGPU::SOFT_WQM: case AMDGPU::STRICT_WWM: { + Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI); - if (MI.isCopy()) { - Register SrcReg = MI.getOperand(1).getReg(); - if (SrcReg == AMDGPU::SCC) { - Register SCCCopy = MRI->createVirtualRegister( - TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); - I = BuildMI(*MI.getParent(), - std::next(MachineBasicBlock::iterator(MI)), - MI.getDebugLoc(), - TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 - : AMDGPU::S_CSELECT_B64), - SCCCopy) - .addImm(-1) - .addImm(0); - I = BuildMI(*MI.getParent(), std::next(I), I->getDebugLoc(), - TII->get(AMDGPU::COPY), DstReg) - .addReg(SCCCopy); - MI.eraseFromParent(); - continue; - } else if (DstReg == AMDGPU::SCC) { - unsigned Opcode = - ST.isWave64() ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; - Register Exec = ST.isWave64() ? 
AMDGPU::EXEC : AMDGPU::EXEC_LO; - Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC()); - I = BuildMI(*MI.getParent(), - std::next(MachineBasicBlock::iterator(MI)), - MI.getDebugLoc(), TII->get(Opcode)) - .addReg(Tmp, getDefRegState(true)) - .addReg(SrcReg) - .addReg(Exec); - MI.eraseFromParent(); - continue; - } + if (MI.isCopy() && (SrcReg == AMDGPU::SCC || DstReg == AMDGPU::SCC)) + SCCCopies.push_back(&MI); + + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) && + // Since VGPR to SGPR copies affect the VGPR to SGPR copy + // score and, hence, the lowering decision, let's try to get + // rid of them as early as possible. + !tryChangeVGPRtoSGPRinCopy(MI, TRI, TII)) { + // Collect those that were not changed to retry them after the + // VGPR to SGPR copy lowering, as there will be more opportunities. + S2VCopies.push_back(&MI); } - - if (!DstReg.isVirtual()) { - // If the destination register is a physical register there isn't - // really much we can do to fix this. - // Some special instructions use M0 as an input. Some even only use - // the first lane. Insert a readfirstlane and hope for the best. - if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) { - Register TmpReg - = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - - BuildMI(*MBB, MI, MI.getDebugLoc(), - TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) - .add(MI.getOperand(1)); - MI.getOperand(1).setReg(TmpReg); - } - + if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) + continue; + if (lowerSpecialCase(MI)) continue; - } - if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) { - tryChangeVGPRtoSGPRinCopy(MI, TRI, TII); - } + analyzeVGPRToSGPRCopy(&MI); break; } - case AMDGPU::PHI: { - processPHINode(MI); - break; - } + case AMDGPU::INSERT_SUBREG: + case AMDGPU::PHI: case AMDGPU::REG_SEQUENCE: { - if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) || - !hasVectorOperands(MI, TRI)) { - foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI); - continue; - } - - break; - } - case AMDGPU::INSERT_SUBREG: { - const TargetRegisterClass *DstRC, *Src0RC, *Src1RC; - DstRC = MRI->getRegClass(MI.getOperand(0).getReg()); - Src0RC = MRI->getRegClass(MI.getOperand(1).getReg()); - Src1RC = MRI->getRegClass(MI.getOperand(2).getReg()); - if (TRI->isSGPRClass(DstRC) && - (TRI->hasVectorRegisters(Src0RC) || - TRI->hasVectorRegisters(Src1RC))) { - LLVM_DEBUG(dbgs() << " Fixing INSERT_SUBREG: " << MI); - MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT); - if (NewBB && NewBB != MBB) { - MBB = NewBB; - E = MBB->end(); - BI = MachineFunction::iterator(MBB); - BE = MF.end(); + if (TRI->isSGPRClass(TII->getOpRegClass(MI, 0))) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg()); + if (TRI->hasVectorRegisters(SrcRC)) { + const TargetRegisterClass *DestRC = + TRI->getEquivalentSGPRClass(SrcRC); + Register NewDst = MRI->createVirtualRegister(DestRC); + MachineBasicBlock *BlockToInsertCopy = + MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB() + : MBB; + MachineBasicBlock::iterator PointToInsertCopy = + MI.isPHI() ? 
BlockToInsertCopy->getFirstInstrTerminator() : I; + MachineInstr *NewCopy = BuildMI(*BlockToInsertCopy, PointToInsertCopy, + PointToInsertCopy->getDebugLoc(), TII->get(AMDGPU::COPY), + NewDst) + .addReg(MO.getReg()); + MO.setReg(NewDst); + analyzeVGPRToSGPRCopy(NewCopy); + } } - assert((!NewBB || NewBB == I->getParent()) && - "moveToVALU did not return the right basic block"); } + + if (MI.isPHI()) + PHINodes.push_back(&MI); + else if (MI.isRegSequence()) + RegSequences.push_back(&MI); + break; } case AMDGPU::V_WRITELANE_B32: { @@ -800,11 +750,41 @@ } } + lowerVGPR2SGPRCopies(MF); + + // Postprocessing + fixSCCCopies(ST.isWave32()); + + for (auto MI : S2VCopies) { + // Check if it is still valid + if (nullptr != MI->getParent() && MI->isCopy()) { + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI); + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + tryChangeVGPRtoSGPRinCopy(*MI, TRI, TII); + } + } + + for (auto MI : RegSequences) { + // Check if it is still valid + if (nullptr != MI->getParent() && MI->isRegSequence()) + foldVGPRCopyIntoRegSequence(*MI, TRI, TII, *MRI); + } + + for (auto MI : PHINodes) { + processPHINode(*MI); + } + if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge) hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII); SiblingPenalty.clear(); V2SCopies.clear(); + SCCCopies.clear(); + RegSequences.clear(); + PHINodes.clear(); + S2VCopies.clear(); return true; } @@ -861,7 +841,29 @@ } bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) { + + Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); + + if (!DstReg.isVirtual()) { + // If the destination register is a physical register there isn't + // really much we can do to fix this. + // Some special instructions use M0 as an input. Some even only use + // the first lane. Insert a readfirstlane and hope for the best. + if (DstReg == AMDGPU::M0 && + TRI->hasVectorRegisters(MRI->getRegClass(SrcReg))) { + Register TmpReg = + MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg) + .add(MI.getOperand(1)); + MI.getOperand(1).setReg(TmpReg); + } + + return true; + } + if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) { TII->moveToVALU(MI, MDT); return true; @@ -880,9 +882,13 @@ return false; } +void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr *MI) { + Register DstReg = MI->getOperand(0).getReg(); + const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); + V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI, + TRI->getRegSizeInBits(*DstRC)); -void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(V2SCopyInfo& Info) { SmallVector<MachineInstr *, 8> AnalysisWorklist; // Needed because the SSA is not a tree but a graph and may have // forks and joins. We should not then go same way twice. 
@@ -930,142 +936,51 @@ AnalysisWorklist.push_back(U); } } + V2SCopies[Info.ID] = Info; } -void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { - - // The main function that computes the VGPR to SGPR copy score - // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU - auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool { - if (I->SChain.empty()) { - I->Score = 0; - return true; - } - I->Siblings = SiblingPenalty[*std::max_element( - I->SChain.begin(), I->SChain.end(), - [&](MachineInstr *A, MachineInstr *B) -> bool { - return SiblingPenalty[A].size() < SiblingPenalty[B].size(); - })]; - I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; }); - // The loop below computes the number of another VGPR to SGPR V2SCopies - // which contribute to the current copy SALU chain. We assume that all the - // V2SCopies with the same source virtual register will be squashed to one - // by regalloc. Also we take care of the V2SCopies of the differnt subregs - // of the same register. - SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; - for (auto J : I->Siblings) { - auto InfoIt = V2SCopies.find(J); - if (InfoIt != V2SCopies.end()) { - MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; - if (SiblingCopy->isImplicitDef()) - // the COPY has already been MoveToVALUed - continue; - - SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(), - SiblingCopy->getOperand(1).getSubReg())); - } - } - I->SiblingPenalty = SrcRegs.size(); - - unsigned Penalty = - I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes; - unsigned Profit = I->SChain.size(); - I->Score = Penalty > Profit ? 0 : Profit - Penalty; - I->NeedToBeConvertedToVALU = I->Score < 3; - return I->NeedToBeConvertedToVALU; - }; - - auto needProcessing = [](MachineInstr &MI) -> bool { - switch (MI.getOpcode()) { - case AMDGPU::COPY: - case AMDGPU::WQM: - case AMDGPU::STRICT_WQM: - case AMDGPU::SOFT_WQM: - case AMDGPU::STRICT_WWM: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::PHI: - return true; - default: - return false; - } - }; - - SmallSet<MachineInstr *, 4> OutOfOrderProcessedCopies; - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; - ++BI) { - MachineBasicBlock *MBB = &*BI; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - MachineInstr *MI = &*I; - if (!needProcessing(*MI)) - continue; - - if (MI->isRegSequence() || MI->isPHI()) { - MachineBasicBlock::iterator J = I; - if (TRI->isSGPRClass(TII->getOpRegClass(*MI, 0))) { - for (MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.getReg().isVirtual()) - continue; - const TargetRegisterClass *SrcRC = MRI->getRegClass(MO.getReg()); - if (TRI->hasVectorRegisters(SrcRC)) { - const TargetRegisterClass *DestRC = - TRI->getEquivalentSGPRClass(SrcRC); - Register NewDst = MRI->createVirtualRegister(DestRC); - MachineBasicBlock *BlockToInsertCopy = MBB; - MachineBasicBlock::iterator PointToInsertCopy = I; - if (MI->isPHI()) { - BlockToInsertCopy = - MI->getOperand(MI->getOperandNo(&MO) + 1).getMBB(); - PointToInsertCopy = - BlockToInsertCopy->getFirstInstrTerminator(); - } - MachineBasicBlock::iterator NewI = - BuildMI(*BlockToInsertCopy, PointToInsertCopy, - PointToInsertCopy->getDebugLoc(), - TII->get(AMDGPU::COPY), NewDst) - .addReg(MO.getReg()); - MO.setReg(NewDst); - if (!MI->isPHI()) { - I = NewI; - MI = &*I; - } else { - // We insert the copy into the basic block that may have been - // already processed. Pass it to the analysis explicitly. 
- V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI, - TRI->getRegSizeInBits(*DestRC)); - analyzeVGPRToSGPRCopy(In); - V2SCopies[In.ID] = In; - OutOfOrderProcessedCopies.insert(MI); - } - } - } - - if (J == I) - continue; - } - - const TargetRegisterClass *SrcRC, *DstRC; - std::tie(SrcRC, DstRC) = getCopyRegClasses(*MI, *TRI, *MRI); - - if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) - continue; - - if (lowerSpecialCase(*MI)) - continue; - - if (OutOfOrderProcessedCopies.contains(MI)) +// The main function that computes the VGPR to SGPR copy score +// and determines how the copy should be further lowered: +// v_readfirstlane_b32 or moveToVALU. +bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) { + if (Info->SChain.empty()) { + Info->Score = 0; + return true; + } + Info->Siblings = SiblingPenalty[*std::max_element( + Info->SChain.begin(), Info->SChain.end(), + [&](MachineInstr *A, MachineInstr *B) -> bool { + return SiblingPenalty[A].size() < SiblingPenalty[B].size(); + })]; + Info->Siblings.remove_if([&](unsigned ID) { return ID == Info->ID; }); + // The loop below computes the number of other VGPR to SGPR V2SCopies + // which contribute to the current copy SALU chain. We assume that all the + // V2SCopies with the same source virtual register will be squashed to one + // by regalloc. Also we take care of the V2SCopies of the different subregs + // of the same register. + SmallSet<std::pair<Register, unsigned>, 4> SrcRegs; + for (auto J : Info->Siblings) { + auto InfoIt = V2SCopies.find(J); + if (InfoIt != V2SCopies.end()) { + MachineInstr *SiblingCopy = InfoIt->getSecond().Copy; + if (SiblingCopy->isImplicitDef()) + // the COPY has already been MoveToVALUed + continue; - V2SCopyInfo In(getNextVGPRToSGPRCopyId(), MI, - TRI->getRegSizeInBits(*DstRC)); - - analyzeVGPRToSGPRCopy(In); - - V2SCopies[In.ID] = In; + SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(), + SiblingCopy->getOperand(1).getSubReg())); } } + Info->SiblingPenalty = SrcRegs.size(); + + unsigned Penalty = + Info->NumSVCopies + Info->SiblingPenalty + Info->NumReadfirstlanes; + unsigned Profit = Info->SChain.size(); + Info->Score = Penalty > Profit ? 0 : Profit - Penalty; + Info->NeedToBeConvertedToVALU = Info->Score < 3; + return Info->NeedToBeConvertedToVALU; +} + +void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) { SmallVector<unsigned, 8> LoweringWorklist; for (auto &C : V2SCopies) { @@ -1142,3 +1057,43 @@ MI->eraseFromParent(); } } + +void SIFixSGPRCopies::fixSCCCopies(bool IsWave32) { + for (auto MI : SCCCopies) { + // May have been erased by earlier lowering. + if (nullptr == MI->getParent()) + continue; + // May already have been lowered. + if (!MI->isCopy()) + continue; + Register SrcReg = MI->getOperand(1).getReg(); + Register DstReg = MI->getOperand(0).getReg(); + if (SrcReg == AMDGPU::SCC) { + Register SCCCopy = MRI->createVirtualRegister( + TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + MachineBasicBlock::iterator I = + BuildMI(*MI->getParent(), + std::next(MachineBasicBlock::iterator(MI)), + MI->getDebugLoc(), + TII->get(IsWave32 ? AMDGPU::S_CSELECT_B32 + : AMDGPU::S_CSELECT_B64), + SCCCopy) + .addImm(-1) + .addImm(0); + BuildMI(*MI->getParent(), std::next(I), I->getDebugLoc(), + TII->get(AMDGPU::COPY), DstReg) + .addReg(SCCCopy); + MI->eraseFromParent(); + } else if (DstReg == AMDGPU::SCC) { + unsigned Opcode = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + Register Exec = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + Register Tmp = MRI->createVirtualRegister(TRI->getBoolRC()); + BuildMI(*MI->getParent(), std::next(MachineBasicBlock::iterator(MI)), + MI->getDebugLoc(), TII->get(Opcode)) + .addReg(Tmp, getDefRegState(true)) + .addReg(SrcReg) + .addReg(Exec); + MI->eraseFromParent(); + } + } +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -41,9 +41,9 @@ ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -88,9 +88,9 @@ ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -160,10 +160,10 @@ ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -207,10 +207,10 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF 
; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -280,10 +280,10 @@ ; W64-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NEXT: {{ $}} ; W64-NEXT: .1: @@ -327,10 +327,10 @@ ; W32-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF - ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec - ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY5]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec + ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, [[COPY2]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: @@ -400,9 +400,9 @@ ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], 
implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 @@ -429,9 +429,9 @@ ; W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W32-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; W32-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 @@ -485,9 +485,9 @@ ; ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; ADDR64-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]].sub0_sub1 ; ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 ; ADDR64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0 @@ -513,9 +513,9 @@ ; W64-NO-ADDR64-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NO-ADDR64-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W64-NO-ADDR64-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W64-NO-ADDR64-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; W64-NO-ADDR64-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W64-NO-ADDR64-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W64-NO-ADDR64-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 $exec ; W64-NO-ADDR64-NEXT: {{ $}} ; W64-NO-ADDR64-NEXT: .1: @@ -560,9 +560,9 @@ ; 
W32-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF ; W32-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]], implicit $exec - ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]], implicit $exec + ; W32-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]], implicit $exec ; W32-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY2]], implicit $exec - ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, [[COPY4]], %subreg.sub1, killed [[COPY7]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 + ; W32-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[COPY6]], %subreg.sub0, killed [[COPY7]], %subreg.sub1, [[COPY3]], %subreg.sub2, killed [[COPY8]], %subreg.sub3 ; W32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 $exec_lo ; W32-NEXT: {{ $}} ; W32-NEXT: .1: diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -433,7 +433,7 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 -; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 @@ -585,7 +585,7 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 -; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 @@ -722,7 +722,7 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff @@ -741,7 +741,7 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 @@ -770,7 +770,7 @@ ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; implicit-def: $sgpr35 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $exec +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 killed $exec ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
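Note on the lowering heuristic: the decision in needToBeConvertedToVALU reduces to simple arithmetic over the copy's scalar use chain. The standalone C++ sketch below is illustrative only; the struct mirrors the V2SCopyInfo fields the heuristic reads, the penalty terms and the threshold of 3 come from needToBeConvertedToVALU in the patch above, and the numbers in main() are invented.

#include <cstdio>

// Illustrative mirror of the V2SCopyInfo fields read by the heuristic.
struct V2SCopyScore {
  unsigned SChainLength;      // SALU instructions reachable from the copy.
  unsigned NumSVCopies;       // Values leaking from the SALU chain back to VGPRs.
  unsigned NumReadfirstlanes; // V_READFIRSTLANE_B32s needed for the copy width.
  unsigned SiblingPenalty;    // Distinct sources among sibling V2S copies.
};

// Returns true when the copy and its SALU chain should be moved to VALU,
// false when it pays off to keep the chain scalar via v_readfirstlane_b32.
static bool needToBeConvertedToVALU(const V2SCopyScore &I) {
  if (I.SChainLength == 0) // No scalar users, nothing to gain.
    return true;
  unsigned Penalty = I.NumSVCopies + I.SiblingPenalty + I.NumReadfirstlanes;
  unsigned Profit = I.SChainLength;
  unsigned Score = Penalty > Profit ? 0 : Profit - Penalty;
  return Score < 3; // Keep the chain scalar only when the win is at least 3.
}

int main() {
  V2SCopyScore A{5, 1, 1, 0}; // Score = 5 - 2 = 3 -> stays scalar (prints 0).
  V2SCopyScore B{2, 1, 1, 0}; // Score = 0 -> moved to VALU (prints 1).
  std::printf("%d %d\n", needToBeConvertedToVALU(A), needToBeConvertedToVALU(B));
  return 0;
}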