Index: include/llvm/CodeGen/MachineScheduler.h =================================================================== --- include/llvm/CodeGen/MachineScheduler.h +++ include/llvm/CodeGen/MachineScheduler.h @@ -221,6 +221,7 @@ /// PreRA and PostRA MachineScheduler. class ScheduleDAGMI : public ScheduleDAGInstrs { protected: + LiveIntervals *LIS; AliasAnalysis *AA; std::unique_ptr SchedImpl; @@ -250,9 +251,10 @@ ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr S, bool IsPostRA) : ScheduleDAGInstrs(*C->MF, C->MLI, IsPostRA, - /*RemoveKillFlags=*/IsPostRA, C->LIS), - AA(C->AA), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), CurrentTop(), - CurrentBottom(), NextClusterPred(nullptr), NextClusterSucc(nullptr) { + /*RemoveKillFlags=*/IsPostRA), + LIS(C->LIS), AA(C->AA), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU), + CurrentTop(), CurrentBottom(), NextClusterPred(nullptr), + NextClusterSucc(nullptr) { #ifndef NDEBUG NumInstrsScheduled = 0; #endif @@ -310,6 +312,8 @@ void viewGraph(const Twine &Name, const Twine &Title) override; void viewGraph() override; + LiveIntervals *getLIS() const { return LIS; } + protected: // Top-Level entry points for the schedule() driver... Index: include/llvm/CodeGen/ScheduleDAGInstrs.h =================================================================== --- include/llvm/CodeGen/ScheduleDAGInstrs.h +++ include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -26,16 +26,17 @@ class MachineFrameInfo; class MachineLoopInfo; class MachineDominatorTree; - class LiveIntervals; class RegPressureTracker; class PressureDiffs; /// An individual mapping from virtual register number to SUnit. 
struct VReg2SUnit { unsigned VirtReg; + unsigned LaneMask; SUnit *SU; - VReg2SUnit(unsigned reg, SUnit *su): VirtReg(reg), SU(su) {} + VReg2SUnit(unsigned VReg, unsigned LaneMask, SUnit *SU) + : VirtReg(VReg), LaneMask(LaneMask), SU(SU) {} unsigned getSparseSetIndex() const { return TargetRegisterInfo::virtReg2Index(VirtReg); @@ -69,7 +70,7 @@ /// Track local uses of virtual registers. These uses are gathered by the DAG /// builder and may be consulted by the scheduler to avoid iterating an entire /// vreg use list. - typedef SparseMultiSet VReg2UseMap; + typedef SparseMultiSet VReg2SUnitMultiMap; /// ScheduleDAGInstrs - A ScheduleDAG subclass for scheduling lists of /// MachineInstrs. @@ -78,9 +79,6 @@ const MachineLoopInfo *MLI; const MachineFrameInfo *MFI; - /// Live Intervals provides reaching defs in preRA scheduling. - LiveIntervals *LIS; - /// TargetSchedModel provides an interface to the machine model. TargetSchedModel SchedModel; @@ -98,6 +96,9 @@ /// it has taken responsibility for scheduling the terminator correctly. bool CanHandleTerminators; + /// Whether lane masks should get tracked. + bool TrackLaneMasks; + /// State specific to the current scheduling region. /// ------------------------------------------------ @@ -120,7 +121,7 @@ /// After calling BuildSchedGraph, each vreg used in the scheduling region /// is mapped to a set of SUnits. These include all local vreg uses, not /// just the uses for a singly defined vreg. - VReg2UseMap VRegUses; + VReg2SUnitMultiMap VRegUses; /// State internal to DAG building. /// ------------------------------- @@ -132,8 +133,12 @@ Reg2SUnitsMap Defs; Reg2SUnitsMap Uses; - /// Track the last instruction in this region defining each virtual register. - VReg2SUnitMap VRegDefs; + /// Track the last instruction(s) in this region defining each virtual + /// register. There may be multiple current definitions for a register with + /// disjunct lanemasks. 
+ VReg2SUnitMultiMap CurrentVRegDefs; + /// Track the last instructions in this region using each virtual register. + VReg2SUnitMultiMap CurrentVRegUses; /// PendingLoads - Remember where unknown loads are after the most recent /// unknown store, as we iterate. As with Defs and Uses, this is here @@ -155,16 +160,12 @@ explicit ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, bool IsPostRAFlag, - bool RemoveKillFlags = false, - LiveIntervals *LIS = nullptr); + bool RemoveKillFlags = false); ~ScheduleDAGInstrs() override {} bool isPostRA() const { return IsPostRA; } - /// \brief Expose LiveIntervals for use in DAG mutators and such. - LiveIntervals *getLIS() const { return LIS; } - /// \brief Get the machine model for instruction scheduling. const TargetSchedModel *getSchedModel() const { return &SchedModel; } @@ -253,6 +254,10 @@ /// Other adjustments may be made to the instruction if necessary. Return /// true if the operand has been deleted, false if not. bool toggleKillFlag(MachineInstr *MI, MachineOperand &MO); + + /// Return a mask for which lanes get read/written by the given (register) + /// machine operand. + unsigned getLaneMaskForMO(const MachineOperand &MO) const; }; /// newSUnit - Creates a new SUnit and return a ptr to it. Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -975,7 +975,7 @@ } // RegisterPressureTracker guarantees that readsReg is true for LiveUses. assert(VNI && "No live value at use."); - for (VReg2UseMap::iterator + for (VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { SUnit *SU = UI->SU; DEBUG(dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") " @@ -1148,7 +1148,7 @@ unsigned LiveOutHeight = DefSU->getHeight(); unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency; // Visit all local users of the vreg def. 
- for (VReg2UseMap::iterator + for (VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) { if (UI->SU == &ExitSU) continue; Index: lib/CodeGen/ScheduleDAGInstrs.cpp =================================================================== --- lib/CodeGen/ScheduleDAGInstrs.cpp +++ lib/CodeGen/ScheduleDAGInstrs.cpp @@ -13,12 +13,12 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/ADT/IntEqClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -51,12 +51,11 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf, const MachineLoopInfo *mli, - bool IsPostRAFlag, bool RemoveKillFlags, - LiveIntervals *lis) - : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis), + bool IsPostRAFlag, bool RemoveKillFlags) + : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags), - CanHandleTerminators(false), FirstDbgValue(nullptr) { - assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals"); + CanHandleTerminators(false), TrackLaneMasks(false), + FirstDbgValue(nullptr) { DbgValues.clear(); assert(!(IsPostRA && MRI.getNumVirtRegs()) && "Virtual registers must be removed prior to PostRA scheduling"); @@ -367,6 +366,21 @@ } } +unsigned ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const { + if (!TrackLaneMasks) + return 1; + unsigned Reg = MO.getReg(); + // No point in tracking lanemasks if we don't have interesting subregisters. 
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg); + if (!RC.HasDisjunctSubRegs) + return 1; + + unsigned SubReg = MO.getSubReg(); + if (SubReg == 0) + return MRI.getMaxLaneMaskForVReg(Reg); + return TRI->getSubRegIndexLaneMask(SubReg); +} + /// addVRegDefDeps - Add register output and data dependencies from this SUnit /// to instructions that occur later in the same scheduling region if they read /// from or write to the virtual register defined at OperIdx. @@ -375,7 +389,47 @@ /// reevaluated. Generally, IV scheduling should be done before coalescing. void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { const MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + const MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); + unsigned DefLaneMask = getLaneMaskForMO(MO); + + if (MO.isDead()) { + assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() && + "Dead defs should have no uses"); + } else { + // If we have a <read-undef> flag, none of the lane values comes from an + // earlier instruction. + unsigned KillLaneMask = MO.isUndef() ? ~0u : DefLaneMask; + // Add data dependence to all uses we found so far. + const TargetSubtargetInfo &ST = MF.getSubtarget(); + for (VReg2SUnitMultiMap::iterator I = CurrentVRegUses.find(Reg), + E = CurrentVRegUses.end(); I != E; /*empty*/) { + // Ignore uses of other lanes. + if ((I->LaneMask & KillLaneMask) == 0) { + ++I; + continue; + } + + if ((I->LaneMask & DefLaneMask) != 0) { + SUnit *UseSU = I->SU; + MachineInstr *Use = UseSU->getInstr(); + SDep Dep(SU, SDep::Data, Reg); + unsigned UseOperIdx = 0; // TODO! FIXME! Get a proper value! + Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use, + UseOperIdx)); + ST.adjustSchedDependency(SU, UseSU, Dep); + UseSU->addPred(Dep); + } + + // If none of the lanes can have come from an earlier instruction remove + // the Use from the list.
+ I->LaneMask &= ~KillLaneMask; + if (I->LaneMask == 0) + I = CurrentVRegUses.erase(I); + else + ++I; + } + } // Singly defined vregs do not have output/anti dependencies. // The current operand is a def, so we have at least one. @@ -383,26 +437,46 @@ if (MRI.hasOneDef(Reg)) return; - // Add output dependence to the next nearest def of this vreg. + // Add output dependence to the next nearest def(s) of this vreg. // // Unless this definition is dead, the output dependence should be // transitively redundant with antidependencies from this definition's // uses. We're conservative for now until we have a way to guarantee the uses // are not eliminated sometime during scheduling. The output dependence edge // is also useful if output latency exceeds def-use latency. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI == VRegDefs.end()) - VRegDefs.insert(VReg2SUnit(Reg, SU)); - else { - SUnit *DefSU = DefI->SU; - if (DefSU != SU && DefSU != &ExitSU) { - SDep Dep(SU, SDep::Output, Reg); - Dep.setLatency( - SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); - DefSU->addPred(Dep); - } - DefI->SU = SU; + unsigned LaneMask = DefLaneMask; + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for other lanes. + if ((V2SU.LaneMask & LaneMask) == 0) + continue; + // Add an output dependence. + SUnit *DefSU = V2SU.SU; + // We may have multiple defs of the same lanes in the instruction. This + // can happen because lanemasks are shared for targets with too many + // subregisters. We also use some representation tricks/hacks where we + // add super-register defs/uses, to imply that although we only access + // parts of the reg we care about the full one. + if (DefSU == SU) + continue; + SDep Dep(SU, SDep::Output, Reg); + Dep.setLatency( + SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr())); + DefSU->addPred(Dep); + + // Update current definition.
This can get tricky if the def was about a + // bigger lanemask before. We then have to shrink it and create a new + // VReg2SUnit for the non-overlapping part. + unsigned OverlapMask = V2SU.LaneMask & LaneMask; + unsigned NonOverlapMask = V2SU.LaneMask & ~LaneMask; + if (NonOverlapMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, V2SU.SU)); + V2SU.SU = SU; + V2SU.LaneMask = OverlapMask; } + // If there are any lanes left we didn't cover yet add a new VReg2SUnit. + if (LaneMask != 0) + CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU)); } /// addVRegUseDeps - Add a register data dependency if the instruction that @@ -413,48 +487,35 @@ /// TODO: Handle ExitSU "uses" properly. void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) { MachineInstr *MI = SU->getInstr(); - unsigned Reg = MI->getOperand(OperIdx).getReg(); + const MachineOperand &MO = MI->getOperand(OperIdx); + unsigned Reg = MO.getReg(); // Record this local VReg use. - VReg2UseMap::iterator UI = VRegUses.find(Reg); + VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg); for (; UI != VRegUses.end(); ++UI) { if (UI->SU == SU) break; } if (UI == VRegUses.end()) - VRegUses.insert(VReg2SUnit(Reg, SU)); - - // Lookup this operand's reaching definition. - assert(LIS && "vreg dependencies requires LiveIntervals"); - LiveQueryResult LRQ - = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI)); - VNInfo *VNI = LRQ.valueIn(); - - // VNI will be valid because MachineOperand::readsReg() is checked by caller. - assert(VNI && "No value to read by operand"); - MachineInstr *Def = LIS->getInstructionFromIndex(VNI->def); - // Phis and other noninstructions (after coalescing) have a NULL Def. - if (Def) { - SUnit *DefSU = getSUnit(Def); - if (DefSU) { - // The reaching Def lives within this scheduling region. - // Create a data dependence. 
- SDep dep(DefSU, SDep::Data, Reg); - // Adjust the dependence latency using operand def/use information, then - // allow the target to perform its own adjustments. - int DefOp = Def->findRegisterDefOperandIdx(Reg); - dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx)); - - const TargetSubtargetInfo &ST = MF.getSubtarget(); - ST.adjustSchedDependency(DefSU, SU, const_cast(dep)); - SU->addPred(dep); - } - } + VRegUses.insert(VReg2SUnit(Reg, 0, SU)); + + unsigned LaneMask = getLaneMaskForMO(MO); - // Add antidependence to the following def of the vreg it uses. - VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg); - if (DefI != VRegDefs.end() && DefI->SU != SU) - DefI->SU->addPred(SDep(SU, SDep::Anti, Reg)); + // Remember the use, SDeps will be added once we find the Def. + CurrentVRegUses.insert(VReg2SUnit(Reg, LaneMask, SU)); + + // Add antidependences to the following defs of the vreg. + for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg), + CurrentVRegDefs.end())) { + // Ignore defs for other lanes. + unsigned PrevDefLaneMask = V2SU.LaneMask; + if ((PrevDefLaneMask & LaneMask) == 0) + continue; + if (V2SU.SU == SU) + continue; + + V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg)); + } } /// Return true if MI is an instruction we are unable to reason about @@ -784,10 +845,14 @@ Defs.setUniverse(TRI->getNumRegs()); Uses.setUniverse(TRI->getNumRegs()); - assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs"); + assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs"); + assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses"); + unsigned NumVirtRegs = MRI.getNumVirtRegs(); + CurrentVRegDefs.setUniverse(NumVirtRegs); + CurrentVRegUses.setUniverse(NumVirtRegs); + VRegUses.clear(); - VRegDefs.setUniverse(MRI.getNumVirtRegs()); - VRegUses.setUniverse(MRI.getNumVirtRegs()); + VRegUses.setUniverse(NumVirtRegs); // Model data dependencies between instructions being scheduled and the // ExitSU. 
@@ -1065,7 +1130,8 @@ Defs.clear(); Uses.clear(); - VRegDefs.clear(); + CurrentVRegDefs.clear(); + CurrentVRegUses.clear(); PendingLoads.clear(); }