// Patch overview: adds an optional SlotIndexes* parameter (default nullptr) to
// SIInstrInfo::insertScratchExecCopy and SIInstrInfo::restoreExec so the
// instructions they build are also inserted into the slot index maps, and makes
// SISimplifyPredicatedCopies wrap whole-wave PRED_COPYs that have a virtual
// destination in an exec save/restore before lowering PRED_COPY to COPY.
// Also adds a MIR test covering a subreg PRED_COPY.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -952,12 +952,12 @@ void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, - bool IsSCCLive) const; + const DebugLoc &DL, Register Reg, bool IsSCCLive, + SlotIndexes *Indexes = nullptr) const; void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - Register Reg) const; + Register Reg, SlotIndexes *Indexes = nullptr) const; /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4952,7 +4952,8 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, - bool IsSCCLive) const { + bool IsSCCLive, + SlotIndexes *Indexes) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool IsWave32 = ST.isWave32(); @@ -4962,25 +4963,36 @@ // the single instruction S_OR_SAVEEXEC that clobbers SCC. unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) + .addReg(Exec, RegState::Kill); + auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); + if (Indexes) { + Indexes->insertMachineInstrInMaps(*StoreExecMI); + Indexes->insertMachineInstrInMaps(*FlipExecMI); + } } else { const unsigned OrSaveExec = IsWave32 ? 
AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. + if (Indexes) + Indexes->insertMachineInstrInMaps(*SaveExec); } } void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg) const { + const DebugLoc &DL, Register Reg, + SlotIndexes *Indexes) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill); + auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec) + .addReg(Reg, RegState::Kill); + if (Indexes) + Indexes->insertMachineInstrInMaps(*ExecRestoreMI); } static const TargetRegisterClass * diff --git a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp --- a/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SISimplifyPredicatedCopies.cpp @@ -10,8 +10,7 @@ /// Lowering the predicated PRED_COPY instructions for various register /// classes. AMDGPU target generates PRED_COPY instruction to differentiate WWM /// copy from COPY. This pass generates the necessary exec mask manipulation -/// instructions to replicate 'Whole Wave Mode' and lowers PRED_COPY back to -/// COPY. +/// instructions for whole-wave copies and lowers PRED_COPY back to COPY. 
// //===----------------------------------------------------------------------===// @@ -21,6 +20,7 @@ #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" #include "llvm/InitializePasses.h" using namespace llvm; @@ -49,6 +49,13 @@ } private: + bool isWWMCopy(const MachineInstr &MI); + bool isSCCLiveAtMI(const MachineInstr &MI); + void addToWWMSpills(MachineFunction &MF, Register Reg); + + LiveIntervals *LIS; + SlotIndexes *Indexes; + VirtRegMap *VRM; const SIRegisterInfo *TRI; const MachineRegisterInfo *MRI; SIMachineFunctionInfo *MFI; @@ -56,18 +63,75 @@ } // End anonymous namespace. -INITIALIZE_PASS(SISimplifyPredicatedCopies, DEBUG_TYPE, - "SI Simplify Predicated Copies", false, false) +INITIALIZE_PASS_BEGIN(SISimplifyPredicatedCopies, DEBUG_TYPE, + "SI Simplify Predicated Copies", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_END(SISimplifyPredicatedCopies, DEBUG_TYPE, + "SI Simplify Predicated Copies", false, false) char SISimplifyPredicatedCopies::ID = 0; char &llvm::SISimplifyPredicatedCopiesID = SISimplifyPredicatedCopies::ID; +// Returns true if \p MI is a whole-wave copy instruction. Looks recursively +// through intermediate full copies to check whether the source register is +// flagged as WWM_REG. +bool SISimplifyPredicatedCopies::isWWMCopy(const MachineInstr &MI) { + // Skip if it is a subreg copy. + if (!MI.isFullCopy()) + return false; + + Register SrcReg = MI.getOperand(1).getReg(); + + if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) + return true; + + if (SrcReg.isPhysical()) + return false; + + // Look recursively skipping intermediate copies. 
+ const MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg); + if (!DefMI || !DefMI->isCopy()) + return false; + + return isWWMCopy(*DefMI); +} + +bool SISimplifyPredicatedCopies::isSCCLiveAtMI(const MachineInstr &MI) { + // We can't determine the liveness info if LIS isn't available. Early return + // in that case and always assume SCC is live. + if (!LIS) + return true; + + LiveRange &LR = + LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); + SlotIndex Idx = LIS->getInstructionIndex(MI); + return LR.liveAt(Idx); +} + +// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills +// for preserving its entire lanes at function prolog/epilog. +void SISimplifyPredicatedCopies::addToWWMSpills(MachineFunction &MF, + Register Reg) { + if (!VRM || Reg.isPhysical()) + return; + + Register PhysReg = VRM->getPhys(Reg); + assert(PhysReg != VirtRegMap::NO_PHYS_REG && + "should have allocated a physical register"); + + MFI->allocateWWMSpill(MF, PhysReg); +} + bool SISimplifyPredicatedCopies::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); MFI = MF.getInfo(); + LIS = getAnalysisIfAvailable(); + Indexes = getAnalysisIfAvailable(); + VRM = getAnalysisIfAvailable(); TRI = ST.getRegisterInfo(); MRI = &MF.getRegInfo(); bool Changed = false; @@ -75,13 +139,19 @@ for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { if (MI.getOpcode() == AMDGPU::PRED_COPY) { - - // Below asserts still fails cause isVgprCopy call isCopy() which is - // false for PRED_COPY. Figure out a way to use isCopyInstr() - // everywhere. assert(TII->isVGPRCopy(MI)); - - // Whole wave register copy logic goes here // - + assert(TII->isVGPRCopy(MI)); + if (MI.getOperand(0).getReg().isVirtual() && isWWMCopy(MI)) { + // For WWM vector copies, manipulate the exec mask around the copy + // instruction. 
+ DebugLoc DL = MI.getDebugLoc(); + MachineBasicBlock::iterator InsertPt = MI.getIterator(); + Register RegForExecCopy = MFI->getSGPRForEXECCopy(); + TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy, + isSCCLiveAtMI(MI), Indexes); + TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes); + addToWWMSpills(MF, MI.getOperand(0).getReg()); + LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI); + } // Lower PRED_COPY to COPY LLVM_DEBUG(dbgs() << MI << " to use COPY opcode"); MI.setDesc(TII->get(AMDGPU::COPY)); diff --git a/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir b/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir @@ -0,0 +1,20 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-simplify-predicated-copies -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# Without the subreg-copy check, this test goes into an infinite loop in isWWMCopy(). +# getUniqueVRegDef of the SrcReg returns the instruction itself if it is a partial copy. +# WWM copies are always full copies, so subreg copies are skipped when checking for one. + +--- +name: subreg_copy +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: false +body: | + bb.0: + ; GCN-LABEL: name: subreg_copy + ; GCN: dead undef %0.sub3:vreg_128_align2 = COPY undef %0.sub1 + ; GCN-NEXT: SI_RETURN + dead undef %0.sub3:vreg_128_align2 = PRED_COPY undef %0.sub1 + SI_RETURN +...