diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -41,6 +41,7 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createLowerWWMCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); @@ -144,6 +145,9 @@ void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeSILowerWWMCopiesPass(PassRegistry &); +extern char &SILowerWWMCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -364,6 +364,7 @@ initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); + initializeSILowerWWMCopiesPass(*PR); initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -1296,6 +1297,7 @@ } bool GCNPassConfig::addPreRewrite() { + addPass(&SILowerWWMCopiesID); if (EnableRegReassign) addPass(&GCNNSAReassignID); return true; @@ -1350,6 +1352,8 @@ addPass(&SILowerSGPRSpillsID); addPass(createVGPRAllocPass(false)); + + addPass(&SILowerWWMCopiesID); return true; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -145,6 +145,7 @@ SILoadStoreOptimizer.cpp SILowerControlFlow.cpp SILowerI1Copies.cpp + SILowerWWMCopies.cpp SILowerSGPRSpills.cpp SIMachineFunctionInfo.cpp SIMachineScheduler.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -170,6 +170,12 @@
   Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
 
 protected:
+  /// If the specific machine instruction is an instruction that moves/copies
+  /// value from one register to another register return destination and source
+  /// registers as machine operands.
+  std::optional<DestSourcePair>
+  isCopyInstrImpl(const MachineInstr &MI) const override;
+
   bool swapSourceModifiers(MachineInstr &MI,
                            MachineOperand &Src0, unsigned Src0OpName,
                            MachineOperand &Src1, unsigned Src1OpName) const;
@@ -827,7 +833,7 @@
   }
 
   bool isVGPRCopy(const MachineInstr &MI) const {
-    assert(MI.isCopy());
+    assert(isCopyInstr(MI));
     Register Dest = MI.getOperand(0).getReg();
     const MachineFunction &MF = *MI.getParent()->getParent();
     const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -897,7 +903,7 @@
     if (OpIdx >= MI.getDesc().NumOperands)
       return false;
 
-    if (MI.isCopy()) {
+    if (isCopyInstr(MI)) {
       unsigned Size = getOpSize(MI, OpIdx);
       assert(Size == 8 || Size == 4);
 
@@ -946,12 +952,12 @@
 
   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg,
-                             bool IsSCCLive) const;
+                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
+                             SlotIndexes *Indexes = nullptr) const;
 
   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg) const;
+                   Register Reg, SlotIndexes *Indexes = nullptr) const;
 
   /// Return the correct register class for \p OpNo.
For target-specific
   /// instructions, this will return the register class that has been defined
@@ -1143,6 +1149,11 @@
   CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
                                  const ScheduleDAGMI *DAG) const override;
 
+  /// Returns WWM_COPY when \p Reg carries the WWM_REG virtual-register flag
+  /// and COPY otherwise.
+  unsigned getLiveRangeSplitOpcode(Register Reg,
+                                   const MachineFunction &MF) const override;
+
   bool isBasicBlockPrologue(const MachineInstr &MI) const override;
 
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2414,6 +2414,14 @@
   return std::pair(Split[0], Split[1]);
 }
 
+std::optional<DestSourcePair>
+SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::WWM_COPY)
+    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+
+  return std::nullopt;
+}
+
 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                       MachineOperand &Src0,
                                       unsigned Src0OpName,
@@ -3080,6 +3088,7 @@
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
   case AMDGPU::COPY:
+  case AMDGPU::WWM_COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   case AMDGPU::V_ACCVGPR_MOV_B32:
@@ -4969,7 +4978,8 @@
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive) const {
+                                        bool IsSCCLive,
+                                        SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4979,23 +4989,35 @@
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+                           .addReg(Exec, RegState::Kill);
+    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    // Register the new instructions with SlotIndexes when it is provided.
+    if (Indexes) {
+      Indexes->insertMachineInstrInMaps(*StoreExecMI);
+      Indexes->insertMachineInstrInMaps(*FlipExecMI);
+    }
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }
 
 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg) const {
+                              const DebugLoc &DL, Register Reg,
+                              SlotIndexes *Indexes) const {
   unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  auto ExecRestoreMI =
+      BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  if (Indexes)
+    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }
 
 static const TargetRegisterClass *
@@ -7980,6 +8002,16 @@
   return ArrayRef(TargetFlags);
 }
 
+unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
+                                              const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  assert(SrcReg.isVirtual());
+  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+    return AMDGPU::WWM_COPY;
+
+  return AMDGPU::COPY;
+}
+
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
          MI.modifiesRegister(AMDGPU::EXEC, &RI);
@@ -8547,7 +8579,7 @@
   // A similar issue also exists with spilling and reloading $exec registers.
   //
   // To prevent that, constrain the %0 register class here.
-  if (MI.isFullCopy()) {
+  if (isFullCopyInstr(MI)) {
     Register DstReg = MI.getOperand(0).getReg();
     Register SrcReg = MI.getOperand(1).getReg();
     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
@@ -8644,7 +8676,7 @@
   if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
     return InstructionUniformity::AlwaysUniform;
 
-  if (MI.isCopy()) {
+  if (isCopyInstr(MI)) {
     const MachineOperand &srcOp = MI.getOperand(1);
     if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
       const TargetRegisterClass *regClass =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -172,6 +172,13 @@
 
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
+def WWM_COPY : SPseudoInstSI <
+  (outs unknown:$dst), (ins unknown:$src)> {
+  let hasSideEffects = 0;
+  let isAsCheapAsAMove = 1;
+  let isConvergent = 1;
+}
+
 def ENTER_STRICT_WWM :
SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
   let Uses = [EXEC];
   let Defs = [EXEC, SCC];
diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
@@ -0,0 +1,149 @@
+//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowering the WWM_COPY instructions for various register classes.
+/// AMDGPU target generates WWM_COPY instruction to differentiate WWM
+/// copy from COPY. This pass generates the necessary exec mask manipulation
+/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to
+/// COPY.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-wwm-copies"
+
+namespace {
+
+/// Lowers every WWM_COPY to a plain COPY that is bracketed by an EXEC
+/// save-and-set-all before the copy and an EXEC restore after it.
+class SILowerWWMCopies : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SILowerWWMCopies() : MachineFunctionPass(ID) {
+    initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Lower WWM Copies"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // NOTE(review): the pass inserts instructions yet preserves all analyses;
+    // only SlotIndexes is handed to the insertion helpers -- confirm intended.
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool isSCCLiveAtMI(const MachineInstr &MI);
+  void addToWWMSpills(MachineFunction &MF, Register Reg);
+
+  LiveIntervals *LIS = nullptr;
+  SlotIndexes *Indexes = nullptr;
+  VirtRegMap *VRM = nullptr;
+  const SIRegisterInfo *TRI = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+  SIMachineFunctionInfo *MFI = nullptr;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false,
+                    false)
+
+char SILowerWWMCopies::ID = 0;
+
+char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID;
+
+// Returns whether SCC is live at \p MI according to LiveIntervals; answers
+// true when LiveIntervals is not available.
+bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+  // We can't determine the liveness info if LIS isn't available. Early return
+  // in that case and always assume SCC is live.
+  if (!LIS)
+    return true;
+
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
+}
+
+// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills
+// for preserving its entire lanes at function prolog/epilog.
+void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) {
+  if (Reg.isPhysical())
+    return;
+
+  assert(VRM && "virtual WWM register requires a VirtRegMap");
+  Register PhysReg = VRM->getPhys(Reg);
+  assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+         "should have allocated a physical register");
+
+  MFI->allocateWWMSpill(MF, PhysReg);
+}
+
+bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  Indexes = getAnalysisIfAvailable<SlotIndexes>();
+  VRM = getAnalysisIfAvailable<VirtRegMap>();
+  TRI = ST.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+
+  // Bail out early when the function has no virtual-register flags recorded.
+  if (!MFI->hasVRegFlags())
+    return false;
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::WWM_COPY)
+        continue;
+
+      // TODO: Club adjacent WWM ops between same exec save/restore
+      assert(TII->isVGPRCopy(MI));
+
+      // For WWM vector copies, manipulate the exec mask around the copy
+      // instruction.
+      const DebugLoc &DL = MI.getDebugLoc();
+      MachineBasicBlock::iterator InsertPt = MI.getIterator();
+      Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+      TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+                                 isSCCLiveAtMI(MI), Indexes);
+      TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+      addToWWMSpills(MF, MI.getOperand(0).getReg());
+      LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+
+      // Lower WWM_COPY back to COPY
+      MI.setDesc(TII->get(AMDGPU::COPY));
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -667,6 +667,8 @@
     return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag;
   }
 
+  bool hasVRegFlags() { return VRegFlags.size(); }
+
   void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
                         Align Alignment = Align(4));
 
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -118,6 +118,7 @@
 ; GCN-O0-NEXT: Fast Register Allocator
 ; GCN-O0-NEXT: SI lower SGPR spill instructions
 ; GCN-O0-NEXT: Fast Register Allocator
+; GCN-O0-NEXT: SI Lower WWM Copies
 ; GCN-O0-NEXT: SI Fix VGPR copies
 ; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis
 ; GCN-O0-NEXT: Fixup Statepoint Caller Saved
@@ -367,6 +368,7 @@
 ; GCN-O1-NEXT: Virtual Register Map
 ; GCN-O1-NEXT: Live Register Matrix
 ; GCN-O1-NEXT: Greedy Register Allocator
+; GCN-O1-NEXT: SI Lower WWM Copies
 ; GCN-O1-NEXT: GCN NSA Reassign
 ; GCN-O1-NEXT: Virtual Register Rewriter
 ; GCN-O1-NEXT: Stack Slot Coloring
@@ -666,6 +668,7 @@
 ; GCN-O1-OPTS-NEXT: Virtual Register Map
 ; GCN-O1-OPTS-NEXT: Live Register Matrix
 ; GCN-O1-OPTS-NEXT: Greedy Register Allocator
+; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
 ; GCN-O1-OPTS-NEXT:
GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: Stack Slot Coloring @@ -975,6 +978,7 @@ ; GCN-O2-NEXT: Virtual Register Map ; GCN-O2-NEXT: Live Register Matrix ; GCN-O2-NEXT: Greedy Register Allocator +; GCN-O2-NEXT: SI Lower WWM Copies ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: Stack Slot Coloring @@ -1296,6 +1300,7 @@ ; GCN-O3-NEXT: Virtual Register Map ; GCN-O3-NEXT: Live Register Matrix ; GCN-O3-NEXT: Greedy Register Allocator +; GCN-O3-NEXT: SI Lower WWM Copies ; GCN-O3-NEXT: GCN NSA Reassign ; GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -21,6 +21,7 @@ ; DEFAULT-NEXT: Virtual Register Map ; DEFAULT-NEXT: Live Register Matrix ; DEFAULT-NEXT: Greedy Register Allocator +; DEFAULT-NEXT: SI Lower WWM Copies ; DEFAULT-NEXT: GCN NSA Reassign ; DEFAULT-NEXT: Virtual Register Rewriter ; DEFAULT-NEXT: Stack Slot Coloring @@ -28,6 +29,7 @@ ; O0: Fast Register Allocator ; O0-NEXT: SI lower SGPR spill instructions ; O0-NEXT: Fast Register Allocator +; O0-NEXT: SI Lower WWM Copies ; O0-NEXT: SI Fix VGPR copies @@ -49,6 +51,7 @@ ; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis ; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter ; BASIC-DEFAULT-NEXT: Greedy Register Allocator +; BASIC-DEFAULT-NEXT: SI Lower WWM Copies ; BASIC-DEFAULT-NEXT: GCN NSA Reassign ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter ; BASIC-DEFAULT-NEXT: Stack Slot Coloring @@ -61,6 +64,7 @@ ; DEFAULT-BASIC-NEXT: Virtual Register Map ; DEFAULT-BASIC-NEXT: Live Register Matrix ; DEFAULT-BASIC-NEXT: Basic Register Allocator +; DEFAULT-BASIC-NEXT: SI Lower WWM Copies ; DEFAULT-BASIC-NEXT: GCN NSA Reassign ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter ; 
DEFAULT-BASIC-NEXT: Stack Slot Coloring @@ -79,6 +83,7 @@ ; BASIC-BASIC-NEXT: Virtual Register Map ; BASIC-BASIC-NEXT: Live Register Matrix ; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: SI Lower WWM Copies ; BASIC-BASIC-NEXT: GCN NSA Reassign ; BASIC-BASIC-NEXT: Virtual Register Rewriter ; BASIC-BASIC-NEXT: Stack Slot Coloring