Index: llvm/trunk/lib/CodeGen/SplitKit.h
===================================================================
--- llvm/trunk/lib/CodeGen/SplitKit.h
+++ llvm/trunk/lib/CodeGen/SplitKit.h
@@ -405,6 +405,17 @@
   /// deleteRematVictims - Delete defs that are dead after rematerializing.
   void deleteRematVictims();
 
+  /// Add a copy instruction copying \p FromReg to \p ToReg before
+  /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it
+  /// necessary to construct a sequence of copies to cover it exactly.
+  SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask,
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+      bool Late, unsigned RegIdx);
+
+  SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+      MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
+      unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy);
+
 public:
   /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
   /// Newly created intervals will be appended to newIntervals.
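To make the lane-covering heuristic in buildCopy (below) easier to follow, here is a
minimal standalone C++ sketch of its first pass, rewritten over plain integer lane
masks instead of LaneBitmask/TargetRegisterInfo. The SubRegIdx record and the
findBestCover name are hypothetical, and std::popcount (C++20) stands in for
LLVM's countPopulation; none of this is part of the patch itself.

  #include <bit>
  #include <cstdint>
  #include <vector>

  // Hypothetical stand-in for a target subregister index: its number plus
  // the lane mask it reads/writes. Index 0 means "no subregister".
  struct SubRegIdx {
    unsigned Idx;
    uint32_t Lanes;
  };

  // First pass: return an index whose lanes match LaneMask exactly if one
  // exists; otherwise the usable index (a strict subset of LaneMask)
  // covering the most lanes. Returns 0 when no candidate fits, which is
  // where the patch reports a fatal error.
  unsigned findBestCover(const std::vector<SubRegIdx> &Candidates,
                         uint32_t LaneMask) {
    unsigned BestIdx = 0;
    int BestCover = 0;
    for (const SubRegIdx &C : Candidates) {
      if (C.Lanes == LaneMask)
        return C.Idx; // perfect match, done
      if ((C.Lanes & ~LaneMask) != 0)
        continue; // would copy lanes outside the requested mask
      int PopCount = std::popcount(C.Lanes);
      if (PopCount > BestCover) {
        BestCover = PopCount;
        BestIdx = C.Idx;
      }
    }
    return BestIdx;
  }
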
Index: llvm/trunk/lib/CodeGen/SplitKit.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SplitKit.cpp
+++ llvm/trunk/lib/CodeGen/SplitKit.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -487,12 +488,127 @@
   VFP = ValueForcePair(nullptr, true);
 }
 
+SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+  bool FirstCopy = !Def.isValid();
+  MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
+      .addReg(ToReg, RegState::Define
+              | getUndefRegState(FirstCopy)
+              | getInternalReadRegState(!FirstCopy), SubIdx)
+      .addReg(FromReg, 0, SubIdx);
+
+  BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+  if (FirstCopy) {
+    SlotIndexes &Indexes = *LIS.getSlotIndexes();
+    Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+    DestLI.createDeadDef(Def, Allocator);
+  } else {
+    CopyMI->bundleWithPred();
+  }
+  LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
+  DestLI.refineSubRanges(Allocator, LaneMask,
+                         [Def, &Allocator](LiveInterval::SubRange &SR) {
+    SR.createDeadDef(Def, Allocator);
+  });
+  return Def;
+}
+
+SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg,
+    LaneBitmask LaneMask, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+  if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
+    // The full vreg is copied.
+    MachineInstr *CopyMI =
+        BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
+    SlotIndexes &Indexes = *LIS.getSlotIndexes();
+    return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+  }
+
+  // Only a subset of lanes needs to be copied. The following is a simple
+  // heuristic to construct a sequence of COPYs. We could add a target
+  // specific callback if this turns out to be suboptimal.
+  LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx));
+
+  // First pass: Try to find a perfectly matching subregister index. If none
+  // exists, find the one covering the most lanemask bits.
+  SmallVector<unsigned, 8> PossibleIndexes;
+  unsigned BestIdx = 0;
+  unsigned BestCover = 0;
+  const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
+  assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
+  for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) {
+    // Is this index even compatible with the given class?
+    if (TRI.getSubClassWithSubReg(RC, Idx) != RC)
+      continue;
+    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+    // Early exit if we found a perfect match.
+    if (SubRegMask == LaneMask) {
+      BestIdx = Idx;
+      break;
+    }
+
+    // The index must not cover any lanes outside \p LaneMask.
+    if ((SubRegMask & ~LaneMask).any())
+      continue;
+
+    unsigned PopCount = countPopulation(SubRegMask.getAsInteger());
+    PossibleIndexes.push_back(Idx);
+    if (PopCount > BestCover) {
+      BestCover = PopCount;
+      BestIdx = Idx;
+    }
+  }
+
+  // Abort if we cannot possibly implement the COPY with the given indexes.
+  if (BestIdx == 0)
+    report_fatal_error("Impossible to implement partial COPY");
+
+  SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore,
+                                        BestIdx, DestLI, Late, SlotIndex());
+
+  // Greedy heuristic: Keep iterating, keeping the best covering subreg index
+  // each time.
+  LaneBitmask LanesLeft = LaneMask & ~(TRI.getSubRegIndexLaneMask(BestIdx));
+  while (LanesLeft.any()) {
+    unsigned BestIdx = 0;
+    int BestCover = INT_MIN;
+    for (unsigned Idx : PossibleIndexes) {
+      LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+      // Early exit if we found a perfect match.
+      if (SubRegMask == LanesLeft) {
+        BestIdx = Idx;
+        break;
+      }
+
+      // Try to cover as much of the remaining lanes as possible but
+      // as few of the already covered lanes as possible.
+      int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger())
+                - countPopulation((SubRegMask & ~LanesLeft).getAsInteger());
+      if (Cover > BestCover) {
+        BestCover = Cover;
+        BestIdx = Idx;
+      }
+    }
+
+    if (BestIdx == 0)
+      report_fatal_error("Impossible to implement partial COPY");
+
+    buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
+                          DestLI, Late, Def);
+    LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx);
+  }
+
+  return Def;
+}
+
 VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
                                    VNInfo *ParentVNI,
                                    SlotIndex UseIdx,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I) {
-  MachineInstr *CopyMI = nullptr;
   SlotIndex Def;
   LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
@@ -505,45 +621,29 @@
   LiveInterval &OrigLI = LIS.getInterval(Original);
   VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
 
+  unsigned Reg = LI->reg;
   bool DidRemat = false;
   if (OrigVNI) {
     LiveRangeEdit::Remat RM(ParentVNI);
     RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
     if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
-      Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
+      Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
       ++NumRemats;
       DidRemat = true;
     }
   }
   if (!DidRemat) {
-    // Can't remat, just insert a copy from parent.
-    CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
-                 .addReg(Edit->getReg());
-    Def = LIS.getSlotIndexes()
-              ->insertMachineInstrInMaps(*CopyMI, Late)
-              .getRegSlot();
+    LaneBitmask LaneMask;
     if (LI->hasSubRanges()) {
-      LaneBitmask LM = LaneBitmask::getNone();
+      LaneMask = LaneBitmask::getNone();
       for (LiveInterval::SubRange &S : LI->subranges())
-        LM |= S.LaneMask;
-
-      if (MRI.getMaxLaneMaskForVReg(LI->reg) != LM) {
-        // Find subreg for the lane mask.
-        unsigned SubIdx = 0;
-        for (unsigned I = 1, E = TRI.getNumSubRegIndices(); I < E; ++I) {
-          if (TRI.getSubRegIndexLaneMask(I) == LM) {
-            SubIdx = I;
-            break;
-          }
-        }
-        if (SubIdx == 0)
-          report_fatal_error("Cannot find subreg index to cover all alive lanes");
-        CopyMI->getOperand(0).setSubReg(SubIdx);
-        CopyMI->getOperand(1).setSubReg(SubIdx);
-        CopyMI->getOperand(0).setIsUndef(true);
-      }
+        LaneMask |= S.LaneMask;
+    } else {
+      LaneMask = LaneBitmask::getAll();
     }
+
+    ++NumCopies;
+    Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);
   }
 
   // Define the value in Reg.
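The second pass of buildCopy then loops greedily: each iteration scores every
candidate by how many still-uncovered lanes it adds minus how many already-copied
lanes it would redundantly re-copy, and emits one more subregister COPY for the
winner. A self-contained sketch under the same illustrative assumptions as the
earlier one (the SubRegIdx record is repeated so this compiles on its own, and
coverRemainingLanes is a hypothetical name):

  #include <bit>
  #include <climits>
  #include <cstdint>
  #include <vector>

  struct SubRegIdx {
    unsigned Idx;
    uint32_t Lanes;
  };

  // Greedy second pass: given the index chosen by the first pass, keep
  // picking the best-scoring candidate until every lane in LaneMask is
  // covered. Returns the sequence of subregister indices, one per COPY.
  std::vector<unsigned> coverRemainingLanes(
      const std::vector<SubRegIdx> &Candidates, uint32_t LaneMask,
      unsigned FirstIdx, uint32_t FirstLanes) {
    std::vector<unsigned> Copies{FirstIdx};
    uint32_t LanesLeft = LaneMask & ~FirstLanes;
    while (LanesLeft != 0) {
      unsigned BestIdx = 0;
      uint32_t BestLanes = 0;
      int BestCover = INT_MIN;
      for (const SubRegIdx &C : Candidates) {
        if (C.Lanes == LanesLeft) { // perfect match for what is left
          BestIdx = C.Idx;
          BestLanes = C.Lanes;
          break;
        }
        int Cover = std::popcount(C.Lanes & LanesLeft)   // newly covered
                  - std::popcount(C.Lanes & ~LanesLeft); // re-copied
        if (Cover > BestCover) {
          BestCover = Cover;
          BestIdx = C.Idx;
          BestLanes = C.Lanes;
        }
      }
      // Give up if no candidate makes progress; the patch reports a fatal
      // error in this situation instead.
      if (BestIdx == 0 || (BestLanes & LanesLeft) == 0)
        break;
      Copies.push_back(BestIdx);
      LanesLeft &= ~BestLanes;
    }
    return Copies;
  }

Each COPY after the first is bundled with its predecessor by
buildSingleSubRegCopy, so the copy sequence is treated as one instruction by
later passes until the rewriter expands it (see VirtRegMap.cpp below).
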
Index: llvm/trunk/lib/CodeGen/VirtRegMap.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/VirtRegMap.cpp
+++ llvm/trunk/lib/CodeGen/VirtRegMap.cpp
@@ -167,6 +167,7 @@
   bool readsUndefSubreg(const MachineOperand &MO) const;
   void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
   void handleIdentityCopy(MachineInstr &MI) const;
+  void expandCopyBundle(MachineInstr &MI) const;
 
 public:
   static char ID;
@@ -372,6 +373,34 @@
   DEBUG(dbgs() << "  deleted.\n");
 }
 
+/// The liverange splitting logic sometimes produces bundles of copies when
+/// subregisters are involved. Expand these into a sequence of copy
+/// instructions after processing the last one in the bundle. This does not
+/// update LiveIntervals, which we should not need for these instructions
+/// anymore.
+void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
+  if (!MI.isCopy())
+    return;
+
+  if (MI.isBundledWithPred() && !MI.isBundledWithSucc()) {
+    // Only do this when the complete bundle is made out of COPYs.
+    for (MachineBasicBlock::reverse_instr_iterator I =
+         std::next(MI.getReverseIterator()); I->isBundledWithSucc(); ++I) {
+      if (!I->isCopy())
+        return;
+    }
+
+    for (MachineBasicBlock::reverse_instr_iterator I = MI.getReverseIterator();
+         I->isBundledWithPred(); ) {
+      MachineInstr &MI = *I;
+      ++I;
+
+      MI.unbundleFromPred();
+      if (Indexes)
+        Indexes->insertMachineInstrInMaps(MI);
+    }
+  }
+}
+
 void VirtRegRewriter::rewrite() {
   bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
   SmallVector<unsigned, 8> SuperDeads;
@@ -463,6 +492,8 @@
 
       DEBUG(dbgs() << "> " << *MI);
 
+      expandCopyBundle(*MI);
+
       // We can remove identity copies right now.
      handleIdentityCopy(*MI);
     }
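Because buildSingleSubRegCopy glues each extra copy to the one before it,
expandCopyBundle fires on the last member of a bundle (bundled with its
predecessor but not with a successor) and walks backwards toward the head. A
minimal model of that two-phase walk, using a hypothetical flattened Inst
record instead of real MachineInstr bundle iterators:

  #include <cstddef>
  #include <vector>

  // Hypothetical flattened view of a basic block: each record notes whether
  // the instruction is a COPY and whether it is glued to its predecessor.
  // A well-formed bundle's head has BundledWithPred == false.
  struct Inst {
    bool IsCopy;
    bool BundledWithPred;
  };

  // Model of expandCopyBundle: called on the last member of a bundle, scan
  // back to the head, bail out unless every member is a COPY, then clear
  // the glue bits so the members become ordinary top-level instructions.
  void expandIfAllCopies(std::vector<Inst> &Block, size_t Last) {
    size_t Head = Last;
    while (true) {
      if (!Block[Head].IsCopy)
        return; // mixed bundle: leave it untouched
      if (!Block[Head].BundledWithPred)
        break; // reached the bundle head
      --Head;
    }
    for (size_t I = Head + 1; I <= Last; ++I)
      Block[I].BundledWithPred = false; // unbundle each member in turn
  }
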
Index: llvm/trunk/test/CodeGen/AMDGPU/splitkit.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/splitkit.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/splitkit.mir
@@ -0,0 +1,64 @@
+# RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
+--- |
+  define void @func0() #0 { ret void }
+  define void @func1() #0 { ret void }
+
+  attributes #0 = { "amdgpu-num-sgpr"="12" }
+...
+---
+# Make sure we only get a single spill+reload even if liverange splitting
+# created a sequence of multiple copy instructions.
+# CHECK-LABEL: name: func0
+# CHECK: SI_SPILL_S128_SAVE
+# CHECK-NOT: SI_SPILL_S128_SAVE
+# CHECK: S_NOP 0
+# CHECK: SI_SPILL_S128_RESTORE
+# CHECK-NOT: SI_SPILL_S128_RESTORE
+name: func0
+body: |
+  bb.0:
+    S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+    S_NOP 0, implicit-def %0.sub3 : sreg_128
+
+    ; Clobber registers
+    S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub3
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub3
+...
+---
+# LiveRange splitting should split this into 2 intervals with the second getting
+# allocated to sgpr0_sgpr1 and the first to something else so we see two copies
+# in between for the two subregisters that are alive.
+# CHECK-LABEL: name: func1
+# CHECK: [[REG0:%sgpr[0-9]+]] = COPY %sgpr0
+# CHECK: [[REG1:%sgpr[0-9]+]] = COPY %sgpr2
+# CHECK: S_NOP 0
+# CHECK: S_NOP 0, implicit [[REG0]]
+# CHECK: S_NOP 0, implicit [[REG1]]
+# CHECK: %sgpr0 = COPY [[REG0]]
+# CHECK: %sgpr2 = COPY [[REG1]]
+# CHECK: S_NOP
+# CHECK: S_NOP 0, implicit %sgpr0
+# CHECK: S_NOP 0, implicit %sgpr2
+name: func1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: %sgpr0, %sgpr1, %sgpr2
+    undef %0.sub0 : sreg_128 = COPY %sgpr0
+    %0.sub2 = COPY %sgpr2
+
+    S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1
+
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub2
+
+    ; Clobber everything but sgpr0-sgpr3
+    S_NOP 0, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11, implicit-def dead %sgpr12, implicit-def dead %sgpr13, implicit-def dead %sgpr14, implicit-def dead %sgpr15, implicit-def dead %vcc_lo, implicit-def dead %vcc_hi
+
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub2
+...
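For reference, the new test can be exercised on its own with the command the
RUN line expands to (the path to the checkout is illustrative):

  llc -o - llvm/test/CodeGen/AMDGPU/splitkit.mir -mtriple=amdgcn-- -mcpu=fiji \
      -verify-machineinstrs -run-pass=greedy,virtregrewriter \
    | FileCheck llvm/test/CodeGen/AMDGPU/splitkit.mir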