Index: llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -869,7 +869,9 @@
     return true;
   }
   if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
-    TII->moveToVALU(MI, MDT);
+    std::set<MachineInstr *> Worklist;
+    Worklist.insert(&MI);
+    TII->moveToVALU(Worklist, MDT);
     return true;
   }
@@ -991,6 +993,10 @@
       LoweringWorklist.push_back(C.second.ID);
   }

+  // Store all the V2S copy instructions that need to be moved to VALU
+  // in the Worklist.
+  std::set<MachineInstr *> Worklist;
+
   while (!LoweringWorklist.empty()) {
     unsigned CurID = LoweringWorklist.pop_back_val();
     auto CurInfoIt = V2SCopies.find(CurID);
@@ -1013,10 +1019,12 @@
       LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                         << " is being turned to VALU\n");
       V2SCopies.erase(C.ID);
-      TII->moveToVALU(*C.Copy, MDT);
+      Worklist.insert(C.Copy);
     }
   }

+  TII->moveToVALU(Worklist, MDT);
+
   // Now do actual lowering
   for (auto C : V2SCopies) {
     MachineInstr *MI = C.second.Copy;
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -81,57 +81,57 @@
   void swapOperands(MachineInstr &Inst) const;

   std::pair<bool, MachineBasicBlock *>
-  moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+  moveScalarAddSub(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;

-  void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+  void lowerSelect(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                    MachineDominatorTree *MDT = nullptr) const;

-  void lowerScalarAbs(SetVectorType &Worklist,
+  void lowerScalarAbs(std::set<MachineInstr *> &Worklist,
                       MachineInstr &Inst) const;

-  void lowerScalarXnor(SetVectorType &Worklist,
+  void lowerScalarXnor(std::set<MachineInstr *> &Worklist,
                        MachineInstr &Inst) const;

-  void splitScalarNotBinop(SetVectorType &Worklist,
+  void splitScalarNotBinop(std::set<MachineInstr *> &Worklist,
                            MachineInstr &Inst,
                            unsigned Opcode) const;

-  void splitScalarBinOpN2(SetVectorType &Worklist,
+  void splitScalarBinOpN2(std::set<MachineInstr *> &Worklist,
                           MachineInstr &Inst,
                           unsigned Opcode) const;

-  void splitScalar64BitUnaryOp(SetVectorType &Worklist,
+  void splitScalar64BitUnaryOp(std::set<MachineInstr *> &Worklist,
                                MachineInstr &Inst, unsigned Opcode,
                                bool Swap = false) const;

-  void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+  void splitScalar64BitAddSub(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                               MachineDominatorTree *MDT = nullptr) const;

-  void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+  void splitScalar64BitBinaryOp(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                                 unsigned Opcode,
                                 MachineDominatorTree *MDT = nullptr) const;

-  void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
+  void splitScalar64BitXnor(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                             MachineDominatorTree *MDT = nullptr) const;

-  void splitScalar64BitBCNT(SetVectorType &Worklist,
+  void splitScalar64BitBCNT(std::set<MachineInstr *> &Worklist,
                             MachineInstr &Inst) const;

-  void splitScalar64BitBFE(SetVectorType &Worklist,
+  void splitScalar64BitBFE(std::set<MachineInstr *> &Worklist,
                            MachineInstr &Inst) const;

-  void movePackToVALU(SetVectorType &Worklist,
+  void movePackToVALU(std::set<MachineInstr *> &Worklist,
                       MachineRegisterInfo &MRI, MachineInstr &Inst) const;

   void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
-                                    SetVectorType &Worklist) const;
+                                    std::set<MachineInstr *> &Worklist) const;

   void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                     MachineInstr &SCCDefInst,
-                                    SetVectorType &Worklist,
+                                    std::set<MachineInstr *> &Worklist,
                                     Register NewCond = Register()) const;

   void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
-                                SetVectorType &Worklist) const;
+                                std::set<MachineInstr *> &Worklist) const;

   const TargetRegisterClass *
   getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -1011,7 +1011,10 @@
   /// Replace this instruction's opcode with the equivalent VALU
   /// opcode. This function will also move the users of \p MI to the
   /// VALU if necessary. If present, \p MDT is updated.
-  MachineBasicBlock *moveToVALU(MachineInstr &MI,
+  //MachineBasicBlock *moveToVALU(MachineInstr &MI,
+  //                              MachineDominatorTree *MDT = nullptr) const;
+
+  void moveToVALU(std::set<MachineInstr *>& worklist,
                                 MachineDominatorTree *MDT = nullptr) const;

   void insertNoop(MachineBasicBlock &MBB,
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -30,6 +30,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/PostOrderIterator.h"

 using namespace llvm;

@@ -6149,424 +6150,425 @@
   return CreatedBB;
 }

-MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
+void SIInstrInfo::moveToVALU(std::set<MachineInstr *>& Worklist,
                                            MachineDominatorTree *MDT) const {
-  SetVectorType Worklist;
-  Worklist.insert(&TopInst);
-  MachineBasicBlock *CreatedBB = nullptr;
-  MachineBasicBlock *CreatedBBTmp = nullptr;
-
-  while (!Worklist.empty()) {
-    MachineInstr &Inst = *Worklist.pop_back_val();
-    MachineBasicBlock *MBB = Inst.getParent();
-    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  if (Worklist.empty()) return;

-    unsigned Opcode = Inst.getOpcode();
-    unsigned NewOpcode = getVALUOp(Inst);
+  MachineFunction* MF = (*Worklist.begin())->getMF();
+  if (MF->empty()) return;
+  ReversePostOrderTraversal<MachineFunction *> RPOT(MF);

-    // Handle some special cases
-    switch (Opcode) {
-    default:
-      break;
-    case AMDGPU::S_ADD_U64_PSEUDO:
-    case AMDGPU::S_SUB_U64_PSEUDO:
-      splitScalar64BitAddSub(Worklist, Inst, MDT);
-      Inst.eraseFromParent();
+  for (MachineBasicBlock *MBB : RPOT) {
+    if (MBB->empty()) continue;

-    case AMDGPU::S_ADD_I32:
-    case AMDGPU::S_SUB_I32: {
-      // FIXME: The u32 versions currently selected use the carry.
-      bool Changed;
-      std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
-      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
-        CreatedBB = CreatedBBTmp;
-      if (Changed)
+    for (auto I = MBB->instr_begin(); I != MBB->instr_end(); ++I) {
+      if (Worklist.empty()) break;
+      MachineInstr& Inst = *I;
+
+      auto itr = std::find(Worklist.begin(), Worklist.end(), &Inst);
+      if (itr == Worklist.end()) continue;
+
+      Worklist.erase(itr);
+
+      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+      unsigned Opcode = Inst.getOpcode();
+      unsigned NewOpcode = getVALUOp(Inst);
+
+      // Handle some special cases
+      switch (Opcode) {
+      default:
+        break;
+      case AMDGPU::S_ADD_U64_PSEUDO:
+      case AMDGPU::S_SUB_U64_PSEUDO:
+        splitScalar64BitAddSub(Worklist, Inst, MDT);
+        Inst.eraseFromParent();
         continue;
+      case AMDGPU::S_ADD_I32:
+      case AMDGPU::S_SUB_I32: {
+        // FIXME: The u32 versions currently selected use the carry.
+        bool Changed;
+        MachineBasicBlock *CreatedBBTmp = nullptr;
+        std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
+        if (Changed)
+          continue;

-      // Default handling
-      break;
-    }
-    case AMDGPU::S_AND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+        // Default handling
+        break;
+      }
+      case AMDGPU::S_AND_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_OR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_OR_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_XOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_XOR_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_NAND_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_NAND_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_NOR_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_NOR_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_XNOR_B64:
-      if (ST.hasDLInsts())
-        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
-      else
-        splitScalar64BitXnor(Worklist, Inst, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_XNOR_B64:
+        if (ST.hasDLInsts())
+          splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+        else
+          splitScalar64BitXnor(Worklist, Inst, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_ANDN2_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_ANDN2_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_ORN2_B64:
-      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_ORN2_B64:
+        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_BREV_B64:
-      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_BREV_B64:
+        splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_NOT_B64:
-      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_NOT_B64:
+        splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_BCNT1_I32_B64:
-      splitScalar64BitBCNT(Worklist, Inst);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_BCNT1_I32_B64:
+        splitScalar64BitBCNT(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_BFE_I64:
-      splitScalar64BitBFE(Worklist, Inst);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_BFE_I64:
+        splitScalar64BitBFE(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_LSHL_B32:
-      if (ST.hasOnlyRevVALUShifts()) {
-        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
-        swapOperands(Inst);
-      }
-      break;
-    case AMDGPU::S_ASHR_I32:
-      if (ST.hasOnlyRevVALUShifts()) {
-        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
-        swapOperands(Inst);
-      }
-      break;
-    case AMDGPU::S_LSHR_B32:
-      if (ST.hasOnlyRevVALUShifts()) {
-        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
-        swapOperands(Inst);
-      }
-      break;
-    case AMDGPU::S_LSHL_B64:
-      if (ST.hasOnlyRevVALUShifts()) {
-        NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
-        swapOperands(Inst);
-      }
-      break;
-    case AMDGPU::S_ASHR_I64:
-      if (ST.hasOnlyRevVALUShifts()) {
-        NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
-        swapOperands(Inst);
-      }
-      break;
-    case AMDGPU::S_LSHR_B64:
-      if (ST.hasOnlyRevVALUShifts()) {
-        NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
-        swapOperands(Inst);
-      }
-      break;
+      case AMDGPU::S_LSHL_B32:
+        if (ST.hasOnlyRevVALUShifts()) {
+          NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+          swapOperands(Inst);
+        }
+        break;
+      case AMDGPU::S_ASHR_I32:
+        if (ST.hasOnlyRevVALUShifts()) {
+          NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+          swapOperands(Inst);
+        }
+        break;
+      case AMDGPU::S_LSHR_B32:
+        if (ST.hasOnlyRevVALUShifts()) {
+          NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+          swapOperands(Inst);
+        }
+        break;
+      case AMDGPU::S_LSHL_B64:
+        if (ST.hasOnlyRevVALUShifts()) {
+          NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
+          swapOperands(Inst);
+        }
+        break;
+      case AMDGPU::S_ASHR_I64:
+        if (ST.hasOnlyRevVALUShifts()) {
+          NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
+          swapOperands(Inst);
+        }
+        break;
+      case AMDGPU::S_LSHR_B64:
+        if (ST.hasOnlyRevVALUShifts()) {
+          NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
+          swapOperands(Inst);
+        }
+        break;

-    case AMDGPU::S_ABS_I32:
-      lowerScalarAbs(Worklist, Inst);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_ABS_I32:
+        lowerScalarAbs(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_CBRANCH_SCC0:
-    case AMDGPU::S_CBRANCH_SCC1: {
-      // Clear unused bits of vcc
-      Register CondReg = Inst.getOperand(1).getReg();
-      bool IsSCC = CondReg == AMDGPU::SCC;
-      Register VCC = RI.getVCC();
-      Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-      unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-      BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
-          .addReg(EXEC)
-          .addReg(IsSCC ? VCC : CondReg);
-      Inst.removeOperand(1);
-    }
-    break;
+      case AMDGPU::S_CBRANCH_SCC0:
+      case AMDGPU::S_CBRANCH_SCC1: {
+        // Clear unused bits of vcc
+        Register CondReg = Inst.getOperand(1).getReg();
+        bool IsSCC = CondReg == AMDGPU::SCC;
+        Register VCC = RI.getVCC();
+        Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+        unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
+            .addReg(EXEC)
+            .addReg(IsSCC ? VCC : CondReg);
+        Inst.removeOperand(1);
+      }
+      break;

-    case AMDGPU::S_BFE_U64:
-    case AMDGPU::S_BFM_B64:
-      llvm_unreachable("Moving this op to VALU not implemented");
+      case AMDGPU::S_BFE_U64:
+      case AMDGPU::S_BFM_B64:
+        llvm_unreachable("Moving this op to VALU not implemented");

-    case AMDGPU::S_PACK_LL_B32_B16:
-    case AMDGPU::S_PACK_LH_B32_B16:
-    case AMDGPU::S_PACK_HL_B32_B16:
-    case AMDGPU::S_PACK_HH_B32_B16:
-      movePackToVALU(Worklist, MRI, Inst);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_PACK_LL_B32_B16:
+      case AMDGPU::S_PACK_LH_B32_B16:
+      case AMDGPU::S_PACK_HL_B32_B16:
+      case AMDGPU::S_PACK_HH_B32_B16:
+        movePackToVALU(Worklist, MRI, Inst);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_XNOR_B32:
-      lowerScalarXnor(Worklist, Inst);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_XNOR_B32:
+        lowerScalarXnor(Worklist, Inst);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_NAND_B32:
-      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_NAND_B32:
+        splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_NOR_B32:
-      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_NOR_B32:
+        splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_ANDN2_B32:
-      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_ANDN2_B32:
+        splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+        Inst.eraseFromParent();
+        continue;

-    case AMDGPU::S_ORN2_B32:
-      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
-      Inst.eraseFromParent();
-      continue;
+      case AMDGPU::S_ORN2_B32:
+        splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
+        Inst.eraseFromParent();
+        continue;

-    // TODO: remove as soon as everything is ready
-    // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
-    // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
-    // can only be selected from the uniform SDNode.
-    case AMDGPU::S_ADD_CO_PSEUDO:
-    case AMDGPU::S_SUB_CO_PSEUDO: {
-      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
-                         ? AMDGPU::V_ADDC_U32_e64
-                         : AMDGPU::V_SUBB_U32_e64;
-      const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
-      Register CarryInReg = Inst.getOperand(4).getReg();
-      if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
-        Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
-        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
-            .addReg(CarryInReg);
-      }
+      // TODO: remove as soon as everything is ready
+      // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
+      // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
+      // can only be selected from the uniform SDNode.
+      case AMDGPU::S_ADD_CO_PSEUDO:
+      case AMDGPU::S_SUB_CO_PSEUDO: {
+        unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+                           ? AMDGPU::V_ADDC_U32_e64
+                           : AMDGPU::V_SUBB_U32_e64;
+        const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+        Register CarryInReg = Inst.getOperand(4).getReg();
+        if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+          Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+          BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+              .addReg(CarryInReg);
+        }
-      Register CarryOutReg = Inst.getOperand(1).getReg();
-
-      Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
-          MRI.getRegClass(Inst.getOperand(0).getReg())));
-      MachineInstr *CarryOp =
-          BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
-              .addReg(CarryOutReg, RegState::Define)
-              .add(Inst.getOperand(2))
-              .add(Inst.getOperand(3))
-              .addReg(CarryInReg)
-              .addImm(0);
-      CreatedBBTmp = legalizeOperands(*CarryOp);
-      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
-        CreatedBB = CreatedBBTmp;
-      MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
-      addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
-      Inst.eraseFromParent();
-    }
-    continue;
-    case AMDGPU::S_UADDO_PSEUDO:
-    case AMDGPU::S_USUBO_PSEUDO: {
-      const DebugLoc &DL = Inst.getDebugLoc();
-      MachineOperand &Dest0 = Inst.getOperand(0);
-      MachineOperand &Dest1 = Inst.getOperand(1);
-      MachineOperand &Src0 = Inst.getOperand(2);
-      MachineOperand &Src1 = Inst.getOperand(3);
-
-      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
-                         ? AMDGPU::V_ADD_CO_U32_e64
-                         : AMDGPU::V_SUB_CO_U32_e64;
-      const TargetRegisterClass *NewRC =
-          RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
-      Register DestReg = MRI.createVirtualRegister(NewRC);
-      MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
-                                   .addReg(Dest1.getReg(), RegState::Define)
-                                   .add(Src0)
-                                   .add(Src1)
-                                   .addImm(0); // clamp bit
-
-      CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
-      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
-        CreatedBB = CreatedBBTmp;
-
-      MRI.replaceRegWith(Dest0.getReg(), DestReg);
-      addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
-                                   Worklist);
-      Inst.eraseFromParent();
-    }
-    continue;
+        Register CarryOutReg = Inst.getOperand(1).getReg();
+
+        Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+            MRI.getRegClass(Inst.getOperand(0).getReg())));
+        MachineInstr *CarryOp =
+            BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+                .addReg(CarryOutReg, RegState::Define)
+                .add(Inst.getOperand(2))
+                .add(Inst.getOperand(3))
+                .addReg(CarryInReg)
+                .addImm(0);
+        legalizeOperands(*CarryOp);
+        MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+        addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+        Inst.eraseFromParent();
+      }
+      continue;
+      case AMDGPU::S_UADDO_PSEUDO:
+      case AMDGPU::S_USUBO_PSEUDO: {
+        const DebugLoc &DL = Inst.getDebugLoc();
+        MachineOperand &Dest0 = Inst.getOperand(0);
+        MachineOperand &Dest1 = Inst.getOperand(1);
+        MachineOperand &Src0 = Inst.getOperand(2);
+        MachineOperand &Src1 = Inst.getOperand(3);
+
+        unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+                           ? AMDGPU::V_ADD_CO_U32_e64
+                           : AMDGPU::V_SUB_CO_U32_e64;
+        const TargetRegisterClass *NewRC =
+            RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+        Register DestReg = MRI.createVirtualRegister(NewRC);
+        MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+                                     .addReg(Dest1.getReg(), RegState::Define)
+                                     .add(Src0)
+                                     .add(Src1)
+                                     .addImm(0); // clamp bit
-    case AMDGPU::S_CSELECT_B32:
-    case AMDGPU::S_CSELECT_B64:
-      lowerSelect(Worklist, Inst, MDT);
-      Inst.eraseFromParent();
-      continue;
-    case AMDGPU::S_CMP_EQ_I32:
-    case AMDGPU::S_CMP_LG_I32:
-    case AMDGPU::S_CMP_GT_I32:
-    case AMDGPU::S_CMP_GE_I32:
-    case AMDGPU::S_CMP_LT_I32:
-    case AMDGPU::S_CMP_LE_I32:
-    case AMDGPU::S_CMP_EQ_U32:
-    case AMDGPU::S_CMP_LG_U32:
-    case AMDGPU::S_CMP_GT_U32:
-    case AMDGPU::S_CMP_GE_U32:
-    case AMDGPU::S_CMP_LT_U32:
-    case AMDGPU::S_CMP_LE_U32:
-    case AMDGPU::S_CMP_EQ_U64:
-    case AMDGPU::S_CMP_LG_U64: {
-      const MCInstrDesc &NewDesc = get(NewOpcode);
-      Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
-      MachineInstr *NewInstr =
-          BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
-              .add(Inst.getOperand(0))
-              .add(Inst.getOperand(1));
       legalizeOperands(*NewInstr, MDT);
-      int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
-      MachineOperand SCCOp = Inst.getOperand(SCCIdx);
-      addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
+
+        MRI.replaceRegWith(Dest0.getReg(), DestReg);
+        addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+                                     Worklist);
        Inst.eraseFromParent();
      }
-      continue;
-    }
-    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
-      // We cannot move this instruction to the VALU, so we should try to
-      // legalize its operands instead.
-      CreatedBBTmp = legalizeOperands(Inst, MDT);
-      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
-        CreatedBB = CreatedBBTmp;
-      continue;
-    }
+      case AMDGPU::S_CSELECT_B32:
+      case AMDGPU::S_CSELECT_B64:
+        lowerSelect(Worklist, Inst, MDT);
+        Inst.eraseFromParent();
+        continue;
+      case AMDGPU::S_CMP_EQ_I32:
+      case AMDGPU::S_CMP_LG_I32:
+      case AMDGPU::S_CMP_GT_I32:
+      case AMDGPU::S_CMP_GE_I32:
+      case AMDGPU::S_CMP_LT_I32:
+      case AMDGPU::S_CMP_LE_I32:
+      case AMDGPU::S_CMP_EQ_U32:
+      case AMDGPU::S_CMP_LG_U32:
+      case AMDGPU::S_CMP_GT_U32:
+      case AMDGPU::S_CMP_GE_U32:
+      case AMDGPU::S_CMP_LT_U32:
+      case AMDGPU::S_CMP_LE_U32:
+      case AMDGPU::S_CMP_EQ_U64:
+      case AMDGPU::S_CMP_LG_U64: {
+        const MCInstrDesc &NewDesc = get(NewOpcode);
+        Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
+        MachineInstr *NewInstr =
+            BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
+                .add(Inst.getOperand(0))
+                .add(Inst.getOperand(1));
+        legalizeOperands(*NewInstr, MDT);
+        int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
+        MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+        addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
+        Inst.eraseFromParent();
+      }
+      continue;
+      }
-    // Handle converting generic instructions like COPY-to-SGPR into
-    // COPY-to-VGPR.
-    if (NewOpcode == Opcode) {
-      Register DstReg = Inst.getOperand(0).getReg();
-      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
-
-      if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
-          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
-        // Instead of creating a copy where src and dst are the same register
-        // class, we just replace all uses of dst with src. These kinds of
-        // copies interfere with the heuristics MachineSink uses to decide
-        // whether or not to split a critical edge. Since the pass assumes
-        // that copies will end up as machine instructions and not be
-        // eliminated.
-        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
-        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
-        MRI.clearKillFlags(Inst.getOperand(1).getReg());
-        Inst.getOperand(0).setReg(DstReg);
-
-        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
-        // these are deleted later, but at -O0 it would leave a suspicious
-        // looking illegal copy of an undef register.
-        for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
-          Inst.removeOperand(I);
-        Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+      if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+        // We cannot move this instruction to the VALU, so we should try to
+        // legalize its operands instead.
+        legalizeOperands(Inst, MDT);
        continue;
      }
-      Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
-      MRI.replaceRegWith(DstReg, NewDstReg);
-      legalizeOperands(Inst, MDT);
-      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
-      continue;
-    }
+      // Handle converting generic instructions like COPY-to-SGPR into
+      // COPY-to-VGPR.
+      if (NewOpcode == Opcode) {
+        Register DstReg = Inst.getOperand(0).getReg();
+        const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
+
+        if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
+            NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+          // Instead of creating a copy where src and dst are the same register
+          // class, we just replace all uses of dst with src. These kinds of
+          // copies interfere with the heuristics MachineSink uses to decide
+          // whether or not to split a critical edge. Since the pass assumes
+          // that copies will end up as machine instructions and not be
+          // eliminated.
+          addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+          MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+          MRI.clearKillFlags(Inst.getOperand(1).getReg());
+          Inst.getOperand(0).setReg(DstReg);
+
+          // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+          // these are deleted later, but at -O0 it would leave a suspicious
+          // looking illegal copy of an undef register.
+          for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+            Inst.removeOperand(I);
+          Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+          continue;
+        }
-    // Use the new VALU Opcode.
-    auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
-                        .setMIFlags(Inst.getFlags());
-    for (const MachineOperand &Op : Inst.explicit_operands())
-      NewInstr->addOperand(Op);
-
-    // Remove any references to SCC. Vector instructions can't read from it, and
-    // We're just about to add the implicit use / defs of VCC, and we don't want
-    // both.
-    for (MachineOperand &Op : Inst.implicit_operands()) {
-      if (Op.getReg() == AMDGPU::SCC) {
-        // Only propagate through live-def of SCC.
-        if (Op.isDef() && !Op.isDead())
-          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
-        if (Op.isUse())
-          addSCCDefsToVALUWorklist(NewInstr, Worklist);
+        Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+        MRI.replaceRegWith(DstReg, NewDstReg);
+        legalizeOperands(Inst, MDT);
+        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+        continue;
      }
-    }
-    Inst.eraseFromParent();
+      // Use the new VALU Opcode.
+      auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
+                          .setMIFlags(Inst.getFlags());
+      for (const MachineOperand &Op : Inst.explicit_operands())
+        NewInstr->addOperand(Op);
+
+      // Remove any references to SCC. Vector instructions can't read from it, and
+      // We're just about to add the implicit use / defs of VCC, and we don't want
+      // both.
+      for (MachineOperand &Op : Inst.implicit_operands()) {
+        if (Op.getReg() == AMDGPU::SCC) {
+          // Only propagate through live-def of SCC.
+          if (Op.isDef() && !Op.isDead())
+            addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
+          if (Op.isUse())
+            addSCCDefsToVALUWorklist(NewInstr, Worklist);
+        }
+      }
-    Register NewDstReg;
-    if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
-      Register DstReg = NewInstr->getOperand(0).getReg();
-      assert(DstReg.isVirtual());
+      Inst.eraseFromParent();
-      // Update the destination register class.
-      const TargetRegisterClass *NewDstRC =
-          getDestEquivalentVGPRClass(*NewInstr);
-      assert(NewDstRC);
+      Register NewDstReg;
+      if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
+        Register DstReg = NewInstr->getOperand(0).getReg();
+        assert(DstReg.isVirtual());
-      NewDstReg = MRI.createVirtualRegister(NewDstRC);
-      MRI.replaceRegWith(DstReg, NewDstReg);
-    }
+        // Update the destination register class.
+        const TargetRegisterClass *NewDstRC =
+            getDestEquivalentVGPRClass(*NewInstr);
+        assert(NewDstRC);
-    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
-      // We are converting these to a BFE, so we need to add the missing
-      // operands for the size and offset.
-      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
-      NewInstr.addImm(0);
-      NewInstr.addImm(Size);
-    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
-      // The VALU version adds the second operand to the result, so insert an
-      // extra 0 operand.
-      NewInstr.addImm(0);
-    }
+        NewDstReg = MRI.createVirtualRegister(NewDstRC);
+        MRI.replaceRegWith(DstReg, NewDstReg);
+      }
-    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
-      const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
-      // If we need to move this to VGPRs, we need to unpack the second operand
-      // back into the 2 separate ones for bit offset and width.
-      assert(OffsetWidthOp.isImm() &&
-             "Scalar BFE is only implemented for constant width and offset");
-      uint32_t Imm = OffsetWidthOp.getImm();
+      if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+        // We are converting these to a BFE, so we need to add the missing
+        // operands for the size and offset.
+        unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+        NewInstr.addImm(0);
+        NewInstr.addImm(Size);
+      } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+        // The VALU version adds the second operand to the result, so insert an
+        // extra 0 operand.
+        NewInstr.addImm(0);
+      }
-      uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
-      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
-      NewInstr->removeOperand(2);
-      NewInstr.addImm(Offset);
-      NewInstr.addImm(BitWidth);
-    }
+      if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+        const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
+        // If we need to move this to VGPRs, we need to unpack the second operand
+        // back into the 2 separate ones for bit offset and width.
+        assert(OffsetWidthOp.isImm() &&
+               "Scalar BFE is only implemented for constant width and offset");
+        uint32_t Imm = OffsetWidthOp.getImm();
+
+        uint32_t Offset = Imm & 0x3f;               // Extract bits [5:0].
+        uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+        NewInstr->removeOperand(2);
+        NewInstr.addImm(Offset);
+        NewInstr.addImm(BitWidth);
+      }
-    fixImplicitOperands(*NewInstr);
+      fixImplicitOperands(*NewInstr);
-    // Legalize the operands
-    CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
-    if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
-      CreatedBB = CreatedBBTmp;
+      // Legalize the operands
+      legalizeOperands(*NewInstr, MDT);
-    if (NewDstReg)
-      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+      if (NewDstReg)
+        addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+    }
   }
-  return CreatedBB;
 }

 // Add/sub require special handling to deal with carry outs.
 std::pair<bool, MachineBasicBlock *>
-SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+SIInstrInfo::moveScalarAddSub(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                               MachineDominatorTree *MDT) const {
   if (ST.hasAddNoCarry()) {
     // Assume there is no user of scc since we don't select this in that case.
@@ -6601,7 +6603,7 @@
   return std::pair(false, nullptr);
 }

-void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+void SIInstrInfo::lowerSelect(std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
                               MachineDominatorTree *MDT) const {

   MachineBasicBlock &MBB = *Inst.getParent();
@@ -6677,7 +6679,7 @@
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }

-void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
+void SIInstrInfo::lowerScalarAbs(std::set<MachineInstr *> &Worklist,
                                  MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6704,7 +6706,7 @@
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }

-void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
+void SIInstrInfo::lowerScalarXnor(std::set<MachineInstr *> &Worklist,
                                   MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6769,7 +6771,7 @@
   }
 }

-void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+void SIInstrInfo::splitScalarNotBinop(std::set<MachineInstr *> &Worklist,
                                       MachineInstr &Inst,
                                       unsigned Opcode) const {
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -6798,7 +6800,7 @@
   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
 }

-void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+void SIInstrInfo::splitScalarBinOpN2(std::set<MachineInstr *>& Worklist,
                                      MachineInstr &Inst,
                                      unsigned Opcode) const {
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -6828,7 +6830,7 @@
 }

 void SIInstrInfo::splitScalar64BitUnaryOp(
-    SetVectorType &Worklist, MachineInstr &Inst,
+    std::set<MachineInstr *> &Worklist, MachineInstr &Inst,
     unsigned Opcode, bool Swap) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6886,7 +6888,7 @@
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }

-void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitAddSub(std::set<MachineInstr *> &Worklist,
                                          MachineInstr &Inst,
                                          MachineDominatorTree *MDT) const {
   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -6960,7 +6962,7 @@
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }

-void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitBinaryOp(std::set<MachineInstr *> &Worklist,
                                            MachineInstr &Inst, unsigned Opcode,
                                            MachineDominatorTree *MDT) const {
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -7027,7 +7029,7 @@
   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
 }

-void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitXnor(std::set<MachineInstr *> &Worklist,
                                        MachineInstr &Inst,
                                        MachineDominatorTree *MDT) const {
   MachineBasicBlock &MBB = *Inst.getParent();
@@ -7070,7 +7072,7 @@
 }

 void SIInstrInfo::splitScalar64BitBCNT(
-    SetVectorType &Worklist, MachineInstr &Inst) const {
+    std::set<MachineInstr *> &Worklist, MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -7107,7 +7109,7 @@
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }

-void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitBFE(std::set<MachineInstr *> &Worklist,
                                       MachineInstr &Inst) const {
   MachineBasicBlock &MBB = *Inst.getParent();
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -7171,7 +7173,7 @@

 void SIInstrInfo::addUsersToMoveToVALUWorklist(
     Register DstReg, MachineRegisterInfo &MRI,
-    SetVectorType &Worklist) const {
+    std::set<MachineInstr *> &Worklist) const {
   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
          E = MRI.use_end(); I != E;) {
     MachineInstr &UseMI = *I->getParent();
@@ -7205,7 +7207,7 @@
   }
 }

-void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
+void SIInstrInfo::movePackToVALU(std::set<MachineInstr *> &Worklist,
                                  MachineRegisterInfo &MRI,
                                  MachineInstr &Inst) const {
   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -7280,7 +7282,7 @@

 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                                MachineInstr &SCCDefInst,
-                                               SetVectorType &Worklist,
+                                               std::set<MachineInstr *> &Worklist,
                                                Register NewCond) const {
   // Ensure that def inst defines SCC, which is still live.
@@ -7323,7 +7325,7 @@
 // sure that the instruction that defines SCC is added to the moveToVALU
 // worklist.
 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
-                                           SetVectorType &Worklist) const {
+                                           std::set<MachineInstr *> &Worklist) const {
   // Look for a preceding instruction that either defines VCC or SCC. If VCC
   // then there is nothing to do because the defining instruction has been
   // converted to a VALU already. If SCC then that instruction needs to be
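
For reference, the worklist discipline introduced above (batch candidates into a std::set keyed by pointer, then consume them in program order by walking the function and testing membership, rather than popping from the set) reduces to the small standalone C++ sketch below. It uses plain STL types only; the names Instr, processAll, and ProgramOrder are illustrative stand-ins and are not part of the patch.

// Standalone sketch of the batching + in-order draining pattern.
#include <iostream>
#include <set>
#include <vector>

struct Instr { int Id; };                       // stand-in for MachineInstr

static void processAll(std::set<Instr *> &Worklist,
                       const std::vector<Instr *> &ProgramOrder) {
  for (Instr *I : ProgramOrder) {               // analogue of the RPOT/block walk
    if (Worklist.empty())
      break;                                    // nothing left to lower
    auto It = Worklist.find(I);                 // membership test, like the std::find above
    if (It == Worklist.end())
      continue;
    Worklist.erase(It);
    std::cout << "lowering instr " << I->Id << "\n";  // lowering body would go here
  }
}

int main() {
  std::vector<Instr> Storage{{0}, {1}, {2}, {3}};
  std::vector<Instr *> ProgramOrder;
  for (Instr &I : Storage)
    ProgramOrder.push_back(&I);

  std::set<Instr *> Worklist{&Storage[3], &Storage[1]};  // batched candidates
  processAll(Worklist, ProgramOrder);           // prints 1 then 3, in program order
  return 0;
}

The point of the sketch is that the processing order is determined by the program-order walk, not by the pointer ordering inside the std::set, which keeps the batched lowering deterministic.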