diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
--- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -47,7 +47,6 @@ private:
   bool LowerSubregToReg(MachineInstr *MI);
-
 };
 
 } // end anonymous namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIOptimizeExecMaskingPreRAPass();
 FunctionPass *createSIOptimizeVGPRLiveRangePass();
 FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createLowerPredicatedCopiesPass();
 FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createSIPreAllocateWWMRegsPass();
@@ -171,6 +172,9 @@
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSILowerPredicatedCopiesPass(PassRegistry &);
+extern char &SILowerPredicatedCopiesID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -361,6 +361,7 @@
   initializeAMDGPUDAGToDAGISelPass(*PR);
   initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
+  initializeSILowerPredicatedCopiesPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -1303,6 +1304,7 @@
 }
 
 bool GCNPassConfig::addPreRewrite() {
+  addPass(&SILowerPredicatedCopiesID);
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
   return true;
@@ -1355,6 +1357,9 @@
   addPass(&SILowerSGPRSpillsID);
 
   addPass(createVGPRAllocPass(false));
+
+  addPass(&SILowerPredicatedCopiesID);
+
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -147,6 +147,7 @@
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
+  SILowerPredicatedCopies.cpp
   SILowerSGPRSpills.cpp
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
--- a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -54,6 +54,7 @@
     for (MachineInstr &MI : MBB) {
       switch (MI.getOpcode()) {
       case AMDGPU::COPY:
+      case AMDGPU::PRED_COPY:
         if (TII->isVGPRCopy(MI) && !MI.readsRegister(AMDGPU::EXEC, TRI)) {
           MI.addOperand(MF,
                         MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -170,6 +170,12 @@
   Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
 
 protected:
+  /// If the specific machine instruction is an instruction that moves/copies
+  /// a value from one register to another register, return the destination
+  /// and source registers as machine operands.
+  std::optional<DestSourcePair>
+  isCopyInstrImpl(const MachineInstr &MI) const override;
+
   bool swapSourceModifiers(MachineInstr &MI,
                            MachineOperand &Src0, unsigned Src0OpName,
                            MachineOperand &Src1, unsigned Src1OpName) const;
@@ -827,7 +833,7 @@
   }
 
   bool isVGPRCopy(const MachineInstr &MI) const {
-    assert(MI.isCopy());
+    assert(isCopyInstr(MI));
     Register Dest = MI.getOperand(0).getReg();
     const MachineFunction &MF = *MI.getParent()->getParent();
     const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -897,7 +903,7 @@
     if (OpIdx >= MI.getDesc().NumOperands)
       return false;
 
-    if (MI.isCopy()) {
+    if (isCopyInstr(MI)) {
       unsigned Size = getOpSize(MI, OpIdx);
       assert(Size == 8 || Size == 4);
 
@@ -946,12 +952,12 @@
 
   void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MBBI,
-                             const DebugLoc &DL, Register Reg,
-                             bool IsSCCLive) const;
+                             const DebugLoc &DL, Register Reg, bool IsSCCLive,
+                             SlotIndexes *Indexes = nullptr) const;
 
   void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
-                   Register Reg) const;
+                   Register Reg, SlotIndexes *Indexes = nullptr) const;
 
   /// Return the correct register class for \p OpNo.  For target-specific
   /// instructions, this will return the register class that has been defined
@@ -1143,6 +1149,9 @@
   CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
                                  const ScheduleDAGMI *DAG) const override;
 
+  unsigned getLiveRangeSplitOpcode(Register reg,
+                                   MachineRegisterInfo &MRI) const override;
+
   bool isBasicBlockPrologue(const MachineInstr &MI) const override;
 
   MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1932,10 +1932,15 @@
 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI.getOpcode()) {
   default: return TargetInstrInfo::expandPostRAPseudo(MI);
+  case AMDGPU::PRED_COPY:
+    TII->lowerCopy(&MI);
+    break;
+
   case AMDGPU::S_MOV_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
@@ -2414,6 +2419,14 @@
   return std::pair(Split[0], Split[1]);
 }
 
+std::optional<DestSourcePair>
+SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+  if (MI.getOpcode() == AMDGPU::COPY || MI.getOpcode() == AMDGPU::PRED_COPY)
+    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+
+  return std::nullopt;
+}
+
 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
                                       MachineOperand &Src0,
                                       unsigned Src0OpName,
@@ -3069,6 +3082,7 @@
   case AMDGPU::S_MOV_B32:
   case AMDGPU::S_MOV_B64:
   case AMDGPU::COPY:
+  case AMDGPU::PRED_COPY:
   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
   case AMDGPU::V_ACCVGPR_READ_B32_e64:
   case AMDGPU::V_ACCVGPR_MOV_B32:
@@ -4958,7 +4972,8 @@
                                         MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MBBI,
                                         const DebugLoc &DL, Register Reg,
-                                        bool IsSCCLive) const {
+                                        bool IsSCCLive,
+                                        SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   bool IsWave32 = ST.isWave32();
@@ -4968,25 +4983,36 @@
     // the single instruction S_OR_SAVEEXEC that clobbers SCC.
     unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
     MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
-    BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+                           .addReg(Exec, RegState::Kill);
+    auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+    if (Indexes) {
+      Indexes->insertMachineInstrInMaps(*StoreExecMI);
+      Indexes->insertMachineInstrInMaps(*FlipExecMI);
+    }
   } else {
     const unsigned OrSaveExec =
         IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
     auto SaveExec =
         BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
     SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(*SaveExec);
   }
 }
 
 void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MBBI,
-                              const DebugLoc &DL, Register Reg) const {
+                              const DebugLoc &DL, Register Reg,
+                              SlotIndexes *Indexes) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-  BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+  auto ExecRestoreMI = BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+                           .addReg(Reg, RegState::Kill);
+  if (Indexes)
+    Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
 }
 
 static const TargetRegisterClass *
@@ -7977,6 +8003,14 @@
   return ArrayRef(TargetFlags);
 }
 
+unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register Reg,
+                                              MachineRegisterInfo &MRI) const {
+  auto *TRI = MRI.getTargetRegisterInfo();
+  const TargetRegisterClass *RC =
+      Reg.isVirtual() ? MRI.getRegClass(Reg) : TRI->getPhysRegBaseClass(Reg);
+  return SIRegisterInfo::isSGPRClass(RC) ? AMDGPU::COPY : AMDGPU::PRED_COPY;
+}
+
 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
          MI.modifiesRegister(AMDGPU::EXEC, &RI);
@@ -8544,7 +8578,7 @@
   // A similar issue also exists with spilling and reloading $exec registers.
   //
   // To prevent that, constrain the %0 register class here.
-  if (MI.isFullCopy()) {
+  if (isFullCopyInstr(MI)) {
     Register DstReg = MI.getOperand(0).getReg();
     Register SrcReg = MI.getOperand(1).getReg();
     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
@@ -8641,7 +8675,7 @@
   if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
     return InstructionUniformity::AlwaysUniform;
 
-  if (MI.isCopy()) {
+  if (isCopyInstr(MI)) {
     const MachineOperand &srcOp = MI.getOperand(1);
     if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
       const TargetRegisterClass *regClass =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3349,6 +3349,15 @@
   let Namespace = "AMDGPU";
 }
 
+def PRED_COPY : AMDGPUGenericInstruction {
+  let OutOperandList = (outs unknown:$dst);
+  let InOperandList = (ins unknown:$src);
+  let AsmString = "PRED_COPY";
+  let hasSideEffects = false;
+  let isAsCheapAsAMove = true;
+  let isPredicable = true;
+}
+
 // Convert a wave address to a swizzled vector address (i.e. this is
 // for copying the stack pointer to a vector address appropriate to
 // use in the offset field of mubuf instructions).
diff --git a/llvm/lib/Target/AMDGPU/SILowerPredicatedCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerPredicatedCopies.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILowerPredicatedCopies.cpp
@@ -0,0 +1,163 @@
+//===-- SILowerPredicatedCopies.cpp - Lower Copies after regalloc ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowering the predicated PRED_COPY instructions for various register
+/// classes. AMDGPU target generates PRED_COPY instruction to differentiate WWM
+/// copy from COPY. This pass generates the necessary exec mask manipulation
+/// instructions to replicate 'Whole Wave Mode' and lowers PRED_COPY back to
+/// COPY.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-predicated-copies"
+
+namespace {
+
+class SILowerPredicatedCopies : public MachineFunctionPass {
+public:
+  static char ID;
+
+  SILowerPredicatedCopies() : MachineFunctionPass(ID) {
+    initializeSILowerPredicatedCopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI Lower Predicated Copies";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool isWWMCopy(const MachineInstr &MI, const TargetInstrInfo &TII);
+  bool isSCCLiveAtMI(const MachineInstr &MI);
+  void addToWWMSpills(MachineFunction &MF, Register Reg);
+
+  LiveIntervals *LIS;
+  SlotIndexes *Indexes;
+  VirtRegMap *VRM;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  SIMachineFunctionInfo *MFI;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerPredicatedCopies, DEBUG_TYPE,
+                      "SI Lower Predicated Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerPredicatedCopies, DEBUG_TYPE,
+                    "SI Lower Predicated Copies", false, false)
+
+char SILowerPredicatedCopies::ID = 0;
+
+char &llvm::SILowerPredicatedCopiesID = SILowerPredicatedCopies::ID;
+
+// Returns true if \p MI is a whole-wave copy instruction. Iterate
+// recursively skipping the intermediate copies if it maps to any
+// whole-wave operation.
+bool SILowerPredicatedCopies::isWWMCopy(const MachineInstr &MI,
+                                        const TargetInstrInfo &TII) {
+  // Skip if it is a subreg copy.
+  if (!TII.isFullCopyInstr(MI))
+    return false;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+    return true;
+
+  if (SrcReg.isPhysical())
+    return false;
+
+  // Look recursively skipping intermediate copies.
+  const MachineInstr *DefMI = MRI->getUniqueVRegDef(SrcReg);
+  if (!DefMI || !TII.isCopyInstr(*DefMI))
+    return false;
+
+  return isWWMCopy(*DefMI, TII);
+}
+
+bool SILowerPredicatedCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+  // We can't determine the liveness info if LIS isn't available. Early return
+  // in that case and always assume SCC is live.
+  if (!LIS)
+    return true;
+
+  LiveRange &LR =
+      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+  return LR.liveAt(Idx);
+}
+
+// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills
+// for preserving its entire lanes at function prolog/epilog.
+void SILowerPredicatedCopies::addToWWMSpills(MachineFunction &MF,
+                                             Register Reg) {
+  if (!VRM || Reg.isPhysical())
+    return;
+
+  Register PhysReg = VRM->getPhys(Reg);
+  assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+         "should have allocated a physical register");
+
+  MFI->allocateWWMSpill(MF, PhysReg);
+}
+
+bool SILowerPredicatedCopies::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
+  LIS = getAnalysisIfAvailable<LiveIntervals>();
+  Indexes = getAnalysisIfAvailable<SlotIndexes>();
+  VRM = getAnalysisIfAvailable<VirtRegMap>();
+  TRI = ST.getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == AMDGPU::PRED_COPY) {
+        assert(TII->isVGPRCopy(MI));
+        if (MI.getOperand(0).getReg().isVirtual() && isWWMCopy(MI, *TII)) {
+          // For WWM vector copies, manipulate the exec mask around the copy
+          // instruction.
+          DebugLoc DL = MI.getDebugLoc();
+          MachineBasicBlock::iterator InsertPt = MI.getIterator();
+          Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+          TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+                                     isSCCLiveAtMI(MI), Indexes);
+          TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+          addToWWMSpills(MF, MI.getOperand(0).getReg());
+          LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+          Changed |= true;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -836,6 +836,7 @@
 ; GFX90A-NEXT:   renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3)
 ; GFX90A-NEXT:   renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec
 ; GFX90A-NEXT:   renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3)
+; GFX90A-NEXT:   renamable $agpr0_agpr1 = PRED_COPY killed renamable $agpr0_agpr1
 ; GFX90A-NEXT:   renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec
 ; GFX90A-NEXT:   renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3)
 ; GFX90A-NEXT:   renamable $sgpr36_sgpr37 = S_MOV_B64 -1
@@ -872,7 +873,7 @@
 ; GFX90A-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc
 ; GFX90A-NEXT:   renamable $sgpr52_sgpr53 = S_MOV_B64 0
 ; GFX90A-NEXT:   renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec
-; GFX90A-NEXT:   renamable $agpr0_agpr1 = COPY killed renamable $vgpr12_vgpr13, implicit $exec
+; GFX90A-NEXT:
renamable $agpr0_agpr1 = PRED_COPY killed renamable $vgpr12_vgpr13 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) @@ -953,7 +954,7 @@ ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr32 = V_CNDMASK_B32_e64 0, $vgpr36, 0, 0, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr32, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = COPY renamable $agpr0_agpr1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = PRED_COPY renamable $agpr0_agpr1 ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, killed $vgpr12, implicit $exec ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr14, implicit $exec ; GFX90A-NEXT: renamable $vgpr52 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr34, killed $sgpr12_sgpr13, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir b/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir --- a/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir @@ -133,14 +133,14 @@ ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_NOP 0 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %31 + ; CHECK-NEXT: [[PRED_COPY:%[0-9]+]]:vreg_128 = PRED_COPY %31 ; CHECK-NEXT: S_NOP 0, implicit %31 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY %29 + ; CHECK-NEXT: [[PRED_COPY1:%[0-9]+]]:vreg_128 = PRED_COPY %29 ; CHECK-NEXT: S_NOP 0, implicit %29 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_128 = COPY %27 + ; CHECK-NEXT: [[PRED_COPY2:%[0-9]+]]:vreg_128 = PRED_COPY %27 ; CHECK-NEXT: S_NOP 0, implicit %27 ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]] + ; CHECK-NEXT: [[PRED_COPY3:%[0-9]+]]:vreg_128 = PRED_COPY [[SI_SPILL_V128_RESTORE1]] ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE1]] ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE2]] @@ -156,10 +156,10 @@ ; CHECK-NEXT: S_NOP 0, implicit %0 ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V128_RESTORE6]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]] - ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] + ; CHECK-NEXT: S_NOP 0, implicit [[PRED_COPY3]] + ; CHECK-NEXT: S_NOP 0, implicit [[PRED_COPY2]] + ; CHECK-NEXT: S_NOP 0, implicit [[PRED_COPY1]] + ; CHECK-NEXT: S_NOP 0, implicit [[PRED_COPY]] bb.0: S_NOP 0, implicit-def %0:vreg_128 S_NOP 0, implicit-def %1:vreg_128 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -123,6 +123,7 @@ ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI lower SGPR spill instructions ; GCN-O0-NEXT: Fast Register Allocator +; GCN-O0-NEXT: SI Lower Predicated Copies ; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis ; GCN-O0-NEXT: Fixup Statepoint Caller Saved @@ 
-372,6 +373,7 @@ ; GCN-O1-NEXT: Virtual Register Map ; GCN-O1-NEXT: Live Register Matrix ; GCN-O1-NEXT: Greedy Register Allocator +; GCN-O1-NEXT: SI Lower Predicated Copies ; GCN-O1-NEXT: GCN NSA Reassign ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: Stack Slot Coloring @@ -681,6 +683,7 @@ ; GCN-O1-OPTS-NEXT: Virtual Register Map ; GCN-O1-OPTS-NEXT: Live Register Matrix ; GCN-O1-OPTS-NEXT: Greedy Register Allocator +; GCN-O1-OPTS-NEXT: SI Lower Predicated Copies ; GCN-O1-OPTS-NEXT: GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: Stack Slot Coloring @@ -992,6 +995,7 @@ ; GCN-O2-NEXT: Virtual Register Map ; GCN-O2-NEXT: Live Register Matrix ; GCN-O2-NEXT: Greedy Register Allocator +; GCN-O2-NEXT: SI Lower Predicated Copies ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: Stack Slot Coloring @@ -1314,6 +1318,7 @@ ; GCN-O3-NEXT: Virtual Register Map ; GCN-O3-NEXT: Live Register Matrix ; GCN-O3-NEXT: Greedy Register Allocator +; GCN-O3-NEXT: SI Lower Predicated Copies ; GCN-O3-NEXT: GCN NSA Reassign ; GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -7321,6 +7321,7 @@ ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NOHSA-SI-NEXT: ; kill: def $vgpr4_vgpr5_vgpr6_vgpr7 killed $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, 0 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -12,20 +12,20 @@ ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef %5:agpr_32 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5767178 /* regdef:VReg_128 */, def %26 - ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %26 + ; REGALLOC-GFX908-NEXT: [[PRED_COPY:%[0-9]+]]:av_128 = PRED_COPY %26 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def %23 ; REGALLOC-GFX908-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[PRED_COPY1:%[0-9]+]]:vreg_128 = PRED_COPY [[PRED_COPY]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[PRED_COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; REGALLOC-GFX908-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; 
REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX908-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) - ; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] - ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] + ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX908-NEXT: S_ENDPGM 0 ; PEI-GFX908-LABEL: name: partial_copy ; PEI-GFX908: bb.0 (%ir-block.0): @@ -36,11 +36,11 @@ ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 5767178 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 - ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = PRED_COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3080202 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 - ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = PRED_COPY killed renamable $agpr0_agpr1_agpr2_agpr3 ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec @@ -59,15 +59,15 @@ ; REGALLOC-GFX90A-NEXT: {{ $}} ; 
REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef %5:agpr_32 ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128_Align2 */, def %25 - ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %25 + ; REGALLOC-GFX90A-NEXT: [[PRED_COPY:%[0-9]+]]:av_128_align2 = PRED_COPY %25 ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) - ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) + ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[PRED_COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX90A-NEXT: [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1) @@ -81,7 +81,7 @@ ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 1703945 /* reguse:AGPR_32 */, undef renamable $agpr0 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6094858 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 - ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = PRED_COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3407882 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1 ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 4, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, 
addrspace 5) ; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir b/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir --- a/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir @@ -52,13 +52,13 @@ ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.1, align 4, addrspace 5) ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE1:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V256_RESTORE]], implicit [[SI_SPILL_V256_RESTORE1]], implicit %4 - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY [[SI_SPILL_V256_RESTORE1]] + ; CHECK-NEXT: [[PRED_COPY:%[0-9]+]]:vreg_256 = PRED_COPY [[SI_SPILL_V256_RESTORE1]] ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_NOP 0, implicit [[COPY]] + ; CHECK-NEXT: S_NOP 0, implicit [[PRED_COPY]] ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE2:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_NOP 0, implicit [[SI_SPILL_V256_RESTORE2]] ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE3:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.2, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir --- a/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-introduces-copy-sgpr-to-agpr.mir @@ -287,6 +287,7 @@ ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec ; GFX908-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr35, implicit $exec, implicit $exec ; GFX908-NEXT: GLOBAL_STORE_DWORD undef $vgpr0_vgpr1, killed renamable $vgpr0, 0, 0, implicit $exec + ; GFX908-NEXT: renamable $agpr0 = KILL killed renamable $agpr0, implicit $exec ; GFX908-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 8, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GFX908-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 12, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; GFX908-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -21,6 +21,7 @@ ; DEFAULT-NEXT: Virtual Register Map ; DEFAULT-NEXT: Live Register Matrix ; DEFAULT-NEXT: Greedy Register Allocator +; DEFAULT-NEXT: SI Lower Predicated Copies ; DEFAULT-NEXT: GCN NSA Reassign ; DEFAULT-NEXT: Virtual Register Rewriter ; DEFAULT-NEXT: Stack Slot Coloring @@ -28,6 +29,7 @@ ; O0: Fast Register Allocator ; O0-NEXT: SI lower SGPR spill instructions ; O0-NEXT: Fast Register Allocator +; O0-NEXT: SI Lower 
Predicated Copies ; O0-NEXT: SI Fix VGPR copies @@ -49,6 +51,7 @@ ; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis ; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter ; BASIC-DEFAULT-NEXT: Greedy Register Allocator +; BASIC-DEFAULT-NEXT: SI Lower Predicated Copies ; BASIC-DEFAULT-NEXT: GCN NSA Reassign ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter ; BASIC-DEFAULT-NEXT: Stack Slot Coloring @@ -61,6 +64,7 @@ ; DEFAULT-BASIC-NEXT: Virtual Register Map ; DEFAULT-BASIC-NEXT: Live Register Matrix ; DEFAULT-BASIC-NEXT: Basic Register Allocator +; DEFAULT-BASIC-NEXT: SI Lower Predicated Copies ; DEFAULT-BASIC-NEXT: GCN NSA Reassign ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter ; DEFAULT-BASIC-NEXT: Stack Slot Coloring @@ -79,6 +83,7 @@ ; BASIC-BASIC-NEXT: Virtual Register Map ; BASIC-BASIC-NEXT: Live Register Matrix ; BASIC-BASIC-NEXT: Basic Register Allocator +; BASIC-BASIC-NEXT: SI Lower Predicated Copies ; BASIC-BASIC-NEXT: GCN NSA Reassign ; BASIC-BASIC-NEXT: Virtual Register Rewriter ; BASIC-BASIC-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir b/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/skip-subreg-copy-from-iswwmcopy-check.mir @@ -0,0 +1,20 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-lower-predicated-copies -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +# The test goes into infinite loop while checking for isWWMCopy(). +# getUniqueVRegDef of the SrcReg returns the instruction itself if it is a partial copy. +# wwm-copies will always be a full copy and hence skip subreg copies while checking for one. + +--- +name: subreg_copy +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: false +body: | + bb.0: + ; GCN-LABEL: name: subreg_copy + ; GCN: dead undef %0.sub3:vreg_128_align2 = PRED_COPY undef %0.sub1 + ; GCN-NEXT: SI_RETURN + dead undef %0.sub3:vreg_128_align2 = PRED_COPY undef %0.sub1 + SI_RETURN +...