diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -918,12 +918,18 @@
 
+  /// Save EXEC into \p Reg and turn on all EXEC bits; instructions are
+  /// inserted before \p MBBI. When \p IsSCCLive is set, two moves are used
+  /// instead of S_OR_SAVEEXEC, which clobbers SCC. If \p Indexes is non-null,
+  /// the new instructions are added to its maps.
   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg,
-                             bool IsSCCLive) const;
+                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
+                             SlotIndexes *Indexes = nullptr) const;
 
+  /// Copy \p Reg back into EXEC before \p MBBI. If \p Indexes is non-null,
+  /// the new instruction is added to its maps.
   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg) const;
+                   Register Reg, SlotIndexes *Indexes = nullptr) const;
 
   /// Return the correct register class for \p OpNo. For target-specific
   /// instructions, this will return the register class that has been defined
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4928,7 +4928,8 @@
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive) const {
+                                        bool IsSCCLive,
+                                        SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4938,25 +4939,38 @@
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+                           .addReg(Exec, RegState::Kill);
+    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    // Register the new instructions with the slot index maps when provided.
+    if (Indexes) {
+      Indexes->insertMachineInstrInMaps(*StoreExecMI);
+      Indexes->insertMachineInstrInMaps(*FlipExecMI);
+    }
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }
 
 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg) const {
+                              const DebugLoc &DL, Register Reg,
+                              SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+                           .addReg(Reg, RegState::Kill);
+  // Register the new instruction with the slot index maps when provided.
+  if (Indexes)
+    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }
 
 static const TargetRegisterClass *
diff --git a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp
@@ -14,7 +14,10 @@
 /// scalar registers don't carry any such dependency and hence the regular COPY
 /// opcode can be used. AMDGPU by default uses PRED_COPY opcode right from the
 /// instruction selection and this pass would simplify the COPY opcode and the
-/// implicit operand field as mentioned above.
+/// implicit operand field as mentioned above. This pass also implements the
+/// EXEC mask manipulation around whole-wave vector register copies: it turns
+/// on all bits of EXEC before the copy and restores the saved mask
+/// immediately afterwards.
// //===----------------------------------------------------------------------===// @@ -52,6 +55,11 @@ } private: + bool isWWMCopy(const MachineInstr &MI); + bool isSCCLiveAtMI(const MachineInstr &MI); + + LiveIntervals *LIS; + SlotIndexes *Indexes; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; SIMachineFunctionInfo *MFI; @@ -61,6 +69,7 @@ INITIALIZE_PASS_BEGIN(SISimplifyPredicatedCopies, DEBUG_TYPE, "SI Simplify Predicated Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_END(SISimplifyPredicatedCopies, DEBUG_TYPE, "SI Simplify Predicated Copies", false, false) @@ -68,11 +77,45 @@ char &llvm::SISimplifyPredicatedCopiesID = SISimplifyPredicatedCopies::ID; +// Returns true if \p MI is a whole-wave copy instruction. Iterate +// recursively skipping the intermediate copies if it maps to any +// whole-wave operation. +bool SISimplifyPredicatedCopies::isWWMCopy(const MachineInstr &MI) { + Register SrcReg = MI.getOperand(1).getReg(); + + if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) + return true; + + if (SrcReg.isPhysical()) + return false; + + // Look recursively skipping intermediate copies. + const MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg); + if (!DefMI || !DefMI->isCopy()) + return false; + + return isWWMCopy(*DefMI); +} + +bool SISimplifyPredicatedCopies::isSCCLiveAtMI(const MachineInstr &MI) { + // We can't determine the liveness info if LIS isn't available. Early return + // in that case and always assume SCC is live. 
+ if (!LIS) + return true; + + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + SlotIndex Idx = LIS->getInstructionIndex(MI); + return LR.liveAt(Idx); +} + bool SISimplifyPredicatedCopies::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); MFI = MF.getInfo(); + LIS = getAnalysisIfAvailable(); + Indexes = getAnalysisIfAvailable(); TRI = ST.getRegisterInfo(); MRI = &MF.getRegInfo(); bool Changed = false; @@ -93,6 +136,20 @@ Changed = true; } } else { + if (TII->isVGPRCopy(MI) && + !TRI->isSGPRReg(*MRI, MI.getOperand(1).getReg()) && + MI.getOperand(0).getReg().isVirtual() && isWWMCopy(MI)) { + // For WWM vector copies, manipulate the exec mask around the copy + // instruction. + DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator InsertPt = MI.getIterator(); + Register RegForExecCopy = MFI->getSGPRForEXECCopy(); + TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy, + isSCCLiveAtMI(MI), Indexes); + TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes); + LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI); + } + // For vector registers, add implicit exec use. if (!MI.readsRegister(AMDGPU::EXEC, TRI)) { MI.addOperand(MF,