Index: include/llvm/CodeGen/ExecutionDepsFix.h =================================================================== --- include/llvm/CodeGen/ExecutionDepsFix.h +++ include/llvm/CodeGen/ExecutionDepsFix.h @@ -23,20 +23,11 @@ #ifndef LLVM_CODEGEN_EXECUTIONDEPSFIX_H #define LLVM_CODEGEN_EXECUTIONDEPSFIX_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/RegisterClassInfo.h" -#include "llvm/Pass.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/MathExtras.h" -#include -#include -#include -#include namespace llvm { @@ -74,7 +65,7 @@ DomainValue *Next; // Twiddleable instructions using or defining these registers. - SmallVector Instrs; + SmallVector Instrs; DomainValue() { clear(); } @@ -118,37 +109,9 @@ } }; -/// Information about a live register. -struct LiveReg { - /// Value currently in this register, or NULL when no value is being tracked. - /// This counts as a DomainValue reference. - DomainValue *Value; - - /// Instruction that defined this register, relative to the beginning of the - /// current basic block. When a LiveReg is used to represent a live-out - /// register, this value is relative to the end of the basic block, so it - /// will be a negative number. - int Def; -}; - -class ExecutionDepsFix : public MachineFunctionPass { - SpecificBumpPtrAllocator Allocator; - SmallVector Avail; - - const TargetRegisterClass *const RC; - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RegClassInfo; - std::vector> AliasMap; - const unsigned NumRegs; - LiveReg *LiveRegs; +class LoopTraversal { +protected: struct MBBInfo { - // Keeps clearance and domain information for all registers. Note that this - // is different from the usual definition notion of liveness. The CPU - // doesn't care whether or not we consider a register killed. - LiveReg *OutRegs = nullptr; - // Whether we have gotten to this block in primary processing yet. bool PrimaryCompleted = false; @@ -166,19 +129,44 @@ using MBBInfoMap = DenseMap; MBBInfoMap MBBInfos; - /// List of undefined register reads in this block in forward order. - std::vector> UndefReads; +public: + LoopTraversal() {} + +protected: + // Overridden by the derived classes. + virtual void enterBasicBlock(MachineBasicBlock *) = 0; + virtual void leaveBasicBlock(MachineBasicBlock *) = 0; + virtual void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) = 0; + + // Traversal operations + void runOnBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass); + bool isBlockDone(MachineBasicBlock *MBB); + bool traverse(MachineFunction &mf); +}; - /// Storage for register unit liveness. - LivePhysRegs LiveRegSet; - /// Current instruction number. - /// The first instruction in each basic block is 0. - int CurInstr; +class ExecutionDomainFix : public MachineFunctionPass, LoopTraversal { + SpecificBumpPtrAllocator Allocator; + SmallVector Avail; + + const TargetRegisterClass *const RC; + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RegClassInfo; + std::vector> AliasMap; + const unsigned NumRegs; + + typedef std::vector RegsDVInfo; + /// The domain value of each reg in the register class in the current basic + /// block. + RegsDVInfo LiveRegs; + /// The domain values of the live out regs of each basic block. + std::vector OutRegs; public: - ExecutionDepsFix(char &PassID, const TargetRegisterClass &RC) - : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {} + ExecutionDomainFix(char &PassID, const TargetRegisterClass &RC) + : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); @@ -211,18 +199,77 @@ void collapse(DomainValue *dv, unsigned domain); bool merge(DomainValue *A, DomainValue *B); - void enterBasicBlock(MachineBasicBlock*); - void leaveBasicBlock(MachineBasicBlock*); - bool isBlockDone(MachineBasicBlock *); - void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass); + // LoopTraversal overrides + void enterBasicBlock(MachineBasicBlock *) override; + void leaveBasicBlock(MachineBasicBlock *) override; + void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) override; + + // Instruction processing bool visitInstr(MachineInstr *); - void processDefs(MachineInstr *, bool breakDependency, bool Kill); + void processDefs(MachineInstr *, bool Kill); void visitSoftInstr(MachineInstr*, unsigned mask); void visitHardInstr(MachineInstr*, unsigned domain); +}; + +class BreakFalseDeps : public MachineFunctionPass, LoopTraversal { +private: + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RegClassInfo; + unsigned NumRegUnits; + + typedef std::vector RegDefInfo; + /// Maps each register unit to the instruction that defined it, relative to + /// the beginning of the current basic block. Live-out register unit values + /// may be negative, as they are relative to the end of the basic block. + RegDefInfo Defs; + /// The def instruction of the live out reg units of each basic block. + std::vector OutDefs; + + /// List of undefined register reads in this block in forward order. + std::vector> UndefReads; + + /// Storage for register unit liveness. + LivePhysRegs LiveRegSet; + + /// Current instruction number. + /// The first instruction in each basic block is 0. + int CurInstr; + +public: + static char ID; // Pass identification, replacement for typeid + + BreakFalseDeps() : MachineFunctionPass(ID) { + initializeBreakFalseDepsPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + // Dependency breaking. bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, unsigned Pref); bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); void processUndefReads(MachineBasicBlock*); + + // LoopTraversal overrides. + void enterBasicBlock(MachineBasicBlock *) override; + void leaveBasicBlock(MachineBasicBlock *) override; + void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) override; + + // Instruction processing. + void processDefs(MachineInstr *, bool breakDependency); }; } // end namepsace llvm Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -80,6 +80,7 @@ void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&); void initializeBranchRelaxationPass(PassRegistry&); void initializeBreakCriticalEdgesPass(PassRegistry&); +void initializeBreakFalseDepsPass(PassRegistry&); void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&); void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&); Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -8,18 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/ExecutionDepsFix.h" - #include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterClassInfo.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; @@ -28,13 +19,13 @@ /// Translate TRI register number to a list of indices into our smaller tables /// of interesting registers. iterator_range::const_iterator> -ExecutionDepsFix::regIndices(unsigned Reg) const { +ExecutionDomainFix::regIndices(unsigned Reg) const { assert(Reg < AliasMap.size() && "Invalid register"); const auto &Entry = AliasMap[Reg]; return make_range(Entry.begin(), Entry.end()); } -DomainValue *ExecutionDepsFix::alloc(int domain) { +DomainValue *ExecutionDomainFix::alloc(int domain) { DomainValue *dv = Avail.empty() ? new(Allocator.Allocate()) DomainValue : Avail.pop_back_val(); @@ -47,7 +38,7 @@ /// Release a reference to DV. When the last reference is released, /// collapse if needed. -void ExecutionDepsFix::release(DomainValue *DV) { +void ExecutionDomainFix::release(DomainValue *DV) { while (DV) { assert(DV->Refs && "Bad DomainValue"); if (--DV->Refs) @@ -67,7 +58,7 @@ /// Follow the chain of dead DomainValues until a live DomainValue is reached. /// Update the referenced pointer when necessary. -DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) { +DomainValue *ExecutionDomainFix::resolve(DomainValue *&DVRef) { DomainValue *DV = DVRef; if (!DV || !DV->Next) return DV; @@ -84,33 +75,33 @@ } /// Set LiveRegs[rx] = dv, updating reference counts. -void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) { +void ExecutionDomainFix::setLiveReg(int rx, DomainValue *dv) { assert(unsigned(rx) < NumRegs && "Invalid index"); - assert(LiveRegs && "Must enter basic block first."); + assert(!LiveRegs.empty() && "Must enter basic block first."); - if (LiveRegs[rx].Value == dv) + if (LiveRegs[rx] == dv) return; - if (LiveRegs[rx].Value) - release(LiveRegs[rx].Value); - LiveRegs[rx].Value = retain(dv); + if (LiveRegs[rx]) + release(LiveRegs[rx]); + LiveRegs[rx] = retain(dv); } // Kill register rx, recycle or collapse any DomainValue. -void ExecutionDepsFix::kill(int rx) { +void ExecutionDomainFix::kill(int rx) { assert(unsigned(rx) < NumRegs && "Invalid index"); - assert(LiveRegs && "Must enter basic block first."); - if (!LiveRegs[rx].Value) + assert(!LiveRegs.empty() && "Must enter basic block first."); + if (!LiveRegs[rx]) return; - release(LiveRegs[rx].Value); - LiveRegs[rx].Value = nullptr; + release(LiveRegs[rx]); + LiveRegs[rx] = nullptr; } /// Force register rx into domain. -void ExecutionDepsFix::force(int rx, unsigned domain) { +void ExecutionDomainFix::force(int rx, unsigned domain) { assert(unsigned(rx) < NumRegs && "Invalid index"); - assert(LiveRegs && "Must enter basic block first."); - if (DomainValue *dv = LiveRegs[rx].Value) { + assert(!LiveRegs.empty() && "Must enter basic block first."); + if (DomainValue *dv = LiveRegs[rx]) { if (dv->isCollapsed()) dv->addDomain(domain); else if (dv->hasDomain(domain)) @@ -119,8 +110,8 @@ // This is an incompatible open DomainValue. Collapse it to whatever and // force the new value into domain. This costs a domain crossing. collapse(dv, dv->getFirstDomain()); - assert(LiveRegs[rx].Value && "Not live after collapse?"); - LiveRegs[rx].Value->addDomain(domain); + assert(LiveRegs[rx] && "Not live after collapse?"); + LiveRegs[rx]->addDomain(domain); } } else { // Set up basic collapsed DomainValue. @@ -130,7 +121,7 @@ /// Collapse open DomainValue into given domain. If there are multiple /// registers using dv, they each get a unique collapsed DomainValue. -void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) { +void ExecutionDomainFix::collapse(DomainValue *dv, unsigned domain) { assert(dv->hasDomain(domain) && "Cannot collapse"); // Collapse all the instructions. @@ -139,14 +130,14 @@ dv->setSingleDomain(domain); // If there are multiple users, give them new, unique DomainValues. - if (LiveRegs && dv->Refs > 1) + if (!LiveRegs.empty() && dv->Refs > 1) for (unsigned rx = 0; rx != NumRegs; ++rx) - if (LiveRegs[rx].Value == dv) + if (LiveRegs[rx] == dv) setLiveReg(rx, alloc(domain)); } /// All instructions and registers in B are moved to A, and B is released. -bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) { +bool ExecutionDomainFix::merge(DomainValue *A, DomainValue *B) { assert(!A->isCollapsed() && "Cannot merge into collapsed"); assert(!B->isCollapsed() && "Cannot merge from collapsed"); if (A == B) @@ -164,15 +155,24 @@ B->Next = retain(A); for (unsigned rx = 0; rx != NumRegs; ++rx) { - assert(LiveRegs && "no space allocated for live registers"); - if (LiveRegs[rx].Value == B) + assert(!LiveRegs.empty() && "no space allocated for live registers"); + if (LiveRegs[rx] == B) setLiveReg(rx, A); } return true; } +// BreakFalseDeps related +char BreakFalseDeps::ID = 0; + +INITIALIZE_PASS(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false) + +// Default values are 'nothing happened a long time ago'. +const int DefaultDefVal = -(1 << 20); + /// Set up LiveRegs by merging predecessor live-out values. -void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { +void BreakFalseDeps::enterBasicBlock(MachineBasicBlock *MBB) { + // Reset instruction counter in each basic block. CurInstr = 0; @@ -180,59 +180,80 @@ UndefReads.clear(); LiveRegSet.clear(); - // Set up LiveRegs to represent registers entering MBB. - if (!LiveRegs) - LiveRegs = new LiveReg[NumRegs]; - + // Set up Defs to represent registers entering MBB. // Default values are 'nothing happened a long time ago'. - for (unsigned rx = 0; rx != NumRegs; ++rx) { - LiveRegs[rx].Value = nullptr; - LiveRegs[rx].Def = -(1 << 20); - } + Defs.assign(NumRegUnits, DefaultDefVal); // This is the entry block. if (MBB->pred_empty()) { for (const auto &LI : MBB->liveins()) { - for (int rx : regIndices(LI.PhysReg)) { + for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { // Treat function live-ins as if they were defined just before the first // instruction. Usually, function arguments are set up immediately // before the call. - LiveRegs[rx].Def = -1; + Defs[*Units] = -1; } } DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); return; } + // Try to coalesce live-out register units from predecessors. + for (MachineBasicBlock *pred : MBB->predecessors()) { + const RegDefInfo &IncomingDefs = OutDefs[pred->getNumber()]; + // Incoming is empty if this is a backedge from a BB + // we haven't processed yet + if (!IncomingDefs.size()) + continue; + + for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) { + // Use the most recent predecessor def for each register unit. + Defs[Unit] = std::max(Defs[Unit], IncomingDefs[Unit]); + } + } + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); +} + +/// Set up LiveRegs by merging predecessor live-out values. +void ExecutionDomainFix::enterBasicBlock(MachineBasicBlock *MBB) { + // Set up LiveRegs to represent registers entering MBB. + if (LiveRegs.empty()) + LiveRegs.resize(NumRegs); + + // Set up LiveRegs to represent registers entering MBB. + // Set default domain values to 'no domain' (nullptr) + LiveRegs.assign(NumRegs, nullptr); + + // This is the entry block. + if (MBB->pred_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); + return; + } + // Try to coalesce live-out registers from predecessors. - for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), - pe = MBB->pred_end(); pi != pe; ++pi) { - auto fi = MBBInfos.find(*pi); - assert(fi != MBBInfos.end() && - "Should have pre-allocated MBBInfos for all MBBs"); - LiveReg *Incoming = fi->second.OutRegs; - // Incoming is null if this is a backedge from a BB + for (MachineBasicBlock *pred : MBB->predecessors()) { + RegsDVInfo &Incoming = OutRegs[pred->getNumber()]; + // Incoming is empty if this is a backedge from a BB // we haven't processed yet - if (Incoming == nullptr) { + if (Incoming.empty()) { continue; } for (unsigned rx = 0; rx != NumRegs; ++rx) { - // Use the most recent predecessor def for each register. - LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); - - DomainValue *pdv = resolve(Incoming[rx].Value); + DomainValue *pdv = resolve(Incoming[rx]); if (!pdv) continue; - if (!LiveRegs[rx].Value) { + if (!LiveRegs[rx]) { setLiveReg(rx, pdv); continue; } // We have a live DomainValue from more than one predecessor. - if (LiveRegs[rx].Value->isCollapsed()) { + if (LiveRegs[rx]->isCollapsed()) { // We are already collapsed, but predecessor is not. Force it. - unsigned Domain = LiveRegs[rx].Value->getFirstDomain(); + unsigned Domain = LiveRegs[rx]->getFirstDomain(); if (!pdv->isCollapsed() && pdv->hasDomain(Domain)) collapse(pdv, Domain); continue; @@ -240,7 +261,7 @@ // Currently open, merge in predecessor. if (!pdv->isCollapsed()) - merge(LiveRegs[rx].Value, pdv); + merge(LiveRegs[rx], pdv); else force(rx, pdv->getFirstDomain()); } @@ -250,29 +271,33 @@ << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); } -void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { - assert(LiveRegs && "Must enter basic block first."); - LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; - // Save register clearances at end of MBB - used by enterBasicBlock(). - MBBInfos[MBB].OutRegs = LiveRegs; +void BreakFalseDeps::leaveBasicBlock(MachineBasicBlock *MBB) { + + assert(Defs.size() && "Must enter basic block first."); // While processing the basic block, we kept `Def` relative to the start // of the basic block for convenience. However, future use of this information // only cares about the clearance from the end of the block, so adjust // everything to be relative to the end of the basic block. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - LiveRegs[i].Def -= CurInstr; - if (OldOutRegs) { - // This must be the second pass. - // Release all the DomainValues instead of keeping them. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - release(OldOutRegs[i].Value); - delete[] OldOutRegs; - } - LiveRegs = nullptr; + for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) + Defs[Unit] -= CurInstr; + + // Save register clearances at end of MBB - used by enterBasicBlock(). + OutDefs[MBB->getNumber()] = Defs; + + Defs.clear(); +} + +void ExecutionDomainFix::leaveBasicBlock(MachineBasicBlock *MBB) { + assert(!LiveRegs.empty() && "Must enter basic block first."); + for (DomainValue *DV : OutRegs[MBB->getNumber()]) { + release(DV); + } + OutRegs[MBB->getNumber()] = LiveRegs; + LiveRegs.clear(); } -bool ExecutionDepsFix::visitInstr(MachineInstr *MI) { +bool ExecutionDomainFix::visitInstr(MachineInstr *MI) { // Update instructions with explicit execution domains. std::pair DomP = TII->getExecutionDomain(*MI); if (DomP.first) { @@ -290,16 +315,22 @@ /// is truly dependent on, or use a register with clearance higher than Pref. /// Returns true if it was able to find a true dependency, thus not requiring /// a dependency breaking instruction regardless of clearance. -bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, +bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, unsigned Pref) { MachineOperand &MO = MI->getOperand(OpIdx); assert(MO.isUndef() && "Expected undef machine operand"); unsigned OriginalReg = MO.getReg(); - // Update only undef operands that are mapped to one register. - if (AliasMap[OriginalReg].size() != 1) - return false; + // Update only undef operands that have reg units that are mapped to one root. + for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) { + unsigned NumRoots = 0; + for (MCRegUnitRootIterator Root(*Unit, TRI); Root.isValid(); ++Root) { + NumRoots++; + if (NumRoots > 1) + return false; + } + } // Get the undef operand's register class const TargetRegisterClass *OpRC = @@ -323,10 +354,11 @@ unsigned MaxClearanceReg = OriginalReg; ArrayRef Order = RegClassInfo.getOrder(OpRC); for (auto Reg : Order) { - assert(AliasMap[Reg].size() == 1 && - "Reg is expected to be mapped to a single index"); - int RCrx = *regIndices(Reg).begin(); - unsigned Clearance = CurInstr - LiveRegs[RCrx].Def; + int TmpDef = DefaultDefVal; + for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { + TmpDef = std::max(TmpDef, Defs[*Unit]); + } + unsigned Clearance = CurInstr - TmpDef; if (Clearance <= MaxClearance) continue; MaxClearance = Clearance; @@ -345,11 +377,11 @@ /// \brief Return true to if it makes sense to break dependence on a partial def /// or undef use. -bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { +bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { unsigned reg = MI->getOperand(OpIdx).getReg(); - for (int rx : regIndices(reg)) { - unsigned Clearance = CurInstr - LiveRegs[rx].Def; + for (MCRegUnitIterator Unit(reg, TRI); Unit.isValid(); ++Unit) { + unsigned Clearance = CurInstr - Defs[*Unit]; DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); if (Pref > Clearance) { @@ -366,8 +398,31 @@ // If Kill is set, also kill off DomainValues clobbered by the defs. // // Also break dependencies on partial defs and undef uses. -void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, - bool Kill) { +void ExecutionDomainFix::processDefs(MachineInstr *MI, bool Kill) { + assert(!MI->isDebugValue() && "Won't process debug values"); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, + e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); + i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + if (MO.isUse()) + continue; + for (int rx : regIndices(MO.getReg())) { + // This instruction explicitly defines rx. + DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << *MI); + + // Kill off domains redefined by generic instructions. + if (Kill) + kill(rx); + } + } +} + +// Update def-ages for registers defined by MI. +// Also break dependencies on partial defs and undef uses. +void BreakFalseDeps::processDefs(MachineInstr *MI, bool breakDependency) { assert(!MI->isDebugValue() && "Won't process debug values"); // Break dependence on undef uses. Do this before updating LiveRegs below. @@ -388,14 +443,14 @@ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) + if (!MO.isReg() || !MO.getReg()) continue; if (MO.isUse()) continue; - for (int rx : regIndices(MO.getReg())) { + for (MCRegUnitIterator Unit(MO.getReg(), TRI); Unit.isValid(); ++Unit) { // This instruction explicitly defines rx. - DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr - << '\t' << *MI); + DEBUG(dbgs() << TRI->getName(MO.getReg()) << ":\t" << CurInstr << '\t' + << *MI); if (breakDependency) { // Check clearance before partial register updates. @@ -405,12 +460,8 @@ TII->breakPartialRegDependency(*MI, i, TRI); } - // How many instructions since rx was last written? - LiveRegs[rx].Def = CurInstr; - - // Kill off domains redefined by generic instructions. - if (Kill) - kill(rx); + // How many instructions since this reg unit was last written? + Defs[*Unit] = CurInstr; } } ++CurInstr; @@ -422,7 +473,7 @@ /// only do it on demand. Note that the occurrence of undefined register reads /// that should be broken is very rare, but when they occur we may have many in /// a single block. -void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) { +void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) { if (UndefReads.empty()) return; @@ -455,7 +506,7 @@ // A hard instruction only works in one domain. All input registers will be // forced into that domain. -void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { +void ExecutionDomainFix::visitHardInstr(MachineInstr *mi, unsigned domain) { // Collapse all uses. for (unsigned i = mi->getDesc().getNumDefs(), e = mi->getDesc().getNumOperands(); i != e; ++i) { @@ -478,20 +529,20 @@ } // A soft instruction can be changed to work in other domains given by mask. -void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { +void ExecutionDomainFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { // Bitmask of available domains for this instruction after taking collapsed // operands into account. unsigned available = mask; // Scan the explicit use operands for incoming domains. SmallVector used; - if (LiveRegs) + if (!LiveRegs.empty()) for (unsigned i = mi->getDesc().getNumDefs(), e = mi->getDesc().getNumOperands(); i != e; ++i) { MachineOperand &mo = mi->getOperand(i); if (!mo.isReg()) continue; for (int rx : regIndices(mo.getReg())) { - DomainValue *dv = LiveRegs[rx].Value; + DomainValue *dv = LiveRegs[rx]; if (dv == nullptr) continue; // Bitmask of domains that dv and available have in common. @@ -522,21 +573,16 @@ // Kill off any remaining uses that don't match available, and build a list of // incoming DomainValues that we want to merge. - SmallVector Regs; + SmallVector Regs; for (int rx : used) { - assert(LiveRegs && "no space allocated for live registers"); - const LiveReg &LR = LiveRegs[rx]; + assert(!LiveRegs.empty() && "no space allocated for live registers"); + DomainValue *LR = LiveRegs[rx]; // This useless DomainValue could have been missed above. - if (!LR.Value->getCommonDomains(available)) { + if (!LR->getCommonDomains(available)) { kill(rx); continue; } - // Sorted insertion. - auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR, - [](const LiveReg *LHS, const LiveReg *RHS) { - return LHS->Def < RHS->Def; - }); - Regs.insert(I, &LR); + Regs.push_back(rx); } // doms are now sorted in order of appearance. Try to merge them all, giving @@ -544,14 +590,14 @@ DomainValue *dv = nullptr; while (!Regs.empty()) { if (!dv) { - dv = Regs.pop_back_val()->Value; + dv = LiveRegs[Regs.pop_back_val()]; // Force the first dv to match the current instruction. dv->AvailableDomains = dv->getCommonDomains(available); assert(dv->AvailableDomains && "Domain should have been filtered"); continue; } - DomainValue *Latest = Regs.pop_back_val()->Value; + DomainValue *Latest = LiveRegs[Regs.pop_back_val()]; // Skip already merged values. if (Latest == dv || Latest->Next) continue; @@ -560,8 +606,8 @@ // If latest didn't merge, it is useless now. Kill all registers using it. for (int i : used) { - assert(LiveRegs && "no space allocated for live registers"); - if (LiveRegs[i].Value == Latest) + assert(!LiveRegs.empty() && "no space allocated for live registers"); + if (LiveRegs[i] == Latest) kill(i); } } @@ -575,13 +621,10 @@ // Finally set all defs and non-collapsed uses to dv. We must iterate through // all the operators, including imp-def ones. - for (MachineInstr::mop_iterator ii = mi->operands_begin(), - ee = mi->operands_end(); - ii != ee; ++ii) { - MachineOperand &mo = *ii; + for (MachineOperand &mo : mi->operands()) { if (!mo.isReg()) continue; for (int rx : regIndices(mo.getReg())) { - if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) { + if (!LiveRegs[rx] || (mo.isDef() && LiveRegs[rx] != dv)) { kill(rx); setLiveReg(rx, dv); } @@ -589,9 +632,22 @@ } } -void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB, +void ExecutionDomainFix::processBasicBlock(MachineBasicBlock *MBB, + bool PrimaryPass) { + // Go over the instructions. + // Kill domain values only in the primary traversal pass. + for (MachineInstr &MI : *MBB) { + if (!MI.isDebugValue()) { + bool Kill = false; + if (PrimaryPass) + Kill = visitInstr(&MI); + processDefs(&MI, Kill); + } + } +} + +void BreakFalseDeps::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) { - enterBasicBlock(MBB); // If this block is not done, it makes little sense to make any decisions // based on clearance information. We need to make a second pass anyway, // and by then we'll have better information, so we can avoid doing the work @@ -599,58 +655,29 @@ bool breakDependency = isBlockDone(MBB); for (MachineInstr &MI : *MBB) { if (!MI.isDebugValue()) { - bool Kill = false; - if (PrimaryPass) - Kill = visitInstr(&MI); - processDefs(&MI, breakDependency, Kill); + processDefs(&MI, breakDependency); } } if (breakDependency) processUndefReads(MBB); +} + +/// Includes the general logic of handeling a basic block, activating sub phases +/// that are implemented by the derived classes. +void LoopTraversal::runOnBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) { + enterBasicBlock(MBB); + processBasicBlock(MBB, PrimaryPass); leaveBasicBlock(MBB); } -bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) { +/// Have we finished processing this block and all its predecessors. +bool LoopTraversal::isBlockDone(MachineBasicBlock *MBB) { return MBBInfos[MBB].PrimaryCompleted && MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); } -bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) { - if (skipFunction(*mf.getFunction())) - return false; - MF = &mf; - TII = MF->getSubtarget().getInstrInfo(); - TRI = MF->getSubtarget().getRegisterInfo(); - RegClassInfo.runOnMachineFunction(mf); - LiveRegs = nullptr; - assert(NumRegs == RC->getNumRegs() && "Bad regclass"); - - DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: " - << TRI->getRegClassName(RC) << " **********\n"); - - // If no relevant registers are used in the function, we can skip it - // completely. - bool anyregs = false; - const MachineRegisterInfo &MRI = mf.getRegInfo(); - for (unsigned Reg : *RC) { - if (MRI.isPhysRegUsed(Reg)) { - anyregs = true; - break; - } - } - if (!anyregs) return false; - - // Initialize the AliasMap on the first use. - if (AliasMap.empty()) { - // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and - // therefore the LiveRegs array. - AliasMap.resize(TRI->getNumRegs()); - for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i) - for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); - AI.isValid(); ++AI) - AliasMap[*AI].push_back(i); - } +bool LoopTraversal::traverse(MachineFunction &mf) { // Initialize the MMBInfos for (auto &MBB : mf) { @@ -686,7 +713,7 @@ * any successors that are now done. */ - MachineBasicBlock *Entry = &*MF->begin(); + MachineBasicBlock *Entry = &*mf.begin(); ReversePostOrderTraversal RPOT(Entry); SmallVector Workqueue; for (ReversePostOrderTraversal::rpo_iterator @@ -701,7 +728,7 @@ while (!Workqueue.empty()) { MachineBasicBlock *ActiveMBB = &*Workqueue.back(); Workqueue.pop_back(); - processBasicBlock(ActiveMBB, Primary); + runOnBasicBlock(ActiveMBB, Primary); bool Done = isBlockDone(ActiveMBB); for (auto *Succ : ActiveMBB->successors()) { if (!isBlockDone(Succ)) { @@ -729,27 +756,92 @@ MBBI != MBBE; ++MBBI) { MachineBasicBlock *MBB = *MBBI; if (!isBlockDone(MBB)) { - processBasicBlock(MBB, false); + runOnBasicBlock(MBB, false); // Don't update successors here. We'll get to them anyway through this // loop. } } - // Clear the LiveOuts vectors and collapse any remaining DomainValues. - for (ReversePostOrderTraversal::rpo_iterator - MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - auto FI = MBBInfos.find(*MBBI); - if (FI == MBBInfos.end() || !FI->second.OutRegs) - continue; - for (unsigned i = 0, e = NumRegs; i != e; ++i) - if (FI->second.OutRegs[i].Value) - release(FI->second.OutRegs[i].Value); - delete[] FI->second.OutRegs; - } MBBInfos.clear(); - UndefReads.clear(); + + return false; +} + +bool ExecutionDomainFix::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + MF = &mf; + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + RegClassInfo.runOnMachineFunction(mf); + LiveRegs.clear(); + OutRegs.clear(); + OutRegs.resize(MF->getNumBlockIDs()); + assert(NumRegs == RC->getNumRegs() && "Bad regclass"); + + DEBUG(dbgs() << "********** FIX EXECUTION DOMAIN: " + << TRI->getRegClassName(RC) << " **********\n"); + + // If no relevant registers are used in the function, we can skip it + // completely. + bool anyregs = false; + const MachineRegisterInfo &MRI = mf.getRegInfo(); + for (unsigned Reg : *RC) { + if (MRI.isPhysRegUsed(Reg)) { + anyregs = true; + break; + } + } + if (!anyregs) + return false; + + // Initialize the AliasMap on the first use. + if (AliasMap.empty()) { + // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and + // therefore the LiveRegs array. + AliasMap.resize(TRI->getNumRegs()); + for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i) + for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); AI.isValid(); + ++AI) + AliasMap[*AI].push_back(i); + } + + // Traverse the basic blocks. + traverse(mf); + + // Collapse any remaining DomainValues in OutRegs. + for (RegsDVInfo LiveOutRegs : OutRegs) { + for (DomainValue *DV : LiveOutRegs) { + if (DV) + release(DV); + } + } + + // Clear structures Avail.clear(); Allocator.DestroyAll(); return false; } + +bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + MF = &mf; + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + NumRegUnits = TRI->getNumRegUnits(); + RegClassInfo.runOnMachineFunction(mf); + Defs.clear(); + OutDefs.clear(); + OutDefs.resize(MF->getNumBlockIDs()); + + DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n"); + + // Traverse the basic blocks. + traverse(mf); + + UndefReads.clear(); + + return false; +} Index: lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- lib/Target/ARM/ARMTargetMachine.cpp +++ lib/Target/ARM/ARMTargetMachine.cpp @@ -75,7 +75,7 @@ cl::desc("Enable the global merge pass")); namespace llvm { - void initializeARMExecutionDepsFixPass(PassRegistry&); + void initializeARMExecutionDomainFixPass(PassRegistry&); } extern "C" void LLVMInitializeARMTarget() { @@ -90,7 +90,7 @@ initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMConstantIslandsPass(Registry); - initializeARMExecutionDepsFixPass(Registry); + initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); } @@ -355,20 +355,20 @@ void addPreEmitPass() override; }; -class ARMExecutionDepsFix : public ExecutionDepsFix { -public: - static char ID; - ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {} - StringRef getPassName() const override { - return "ARM Execution Dependency Fix"; - } -}; -char ARMExecutionDepsFix::ID; - -} // end anonymous namespace - -INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix", - "ARM Execution Dependency Fix", false, false) +class ARMExecutionDomainFix : public ExecutionDomainFix { +public: + static char ID; + ARMExecutionDomainFix() : ExecutionDomainFix(ID, ARM::DPRRegClass) {} + StringRef getPassName() const override { + return "ARM Execution Domain Fix"; + } +}; +char ARMExecutionDomainFix::ID; + +} // end anonymous namespace + +INITIALIZE_PASS(ARMExecutionDomainFix, "arm-execution-domain-fix", + "ARM Execution Domain Fix", false, false) TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { return new ARMPassConfig(*this, PM); @@ -462,7 +462,8 @@ if (EnableARMLoadStoreOpt) addPass(createARMLoadStoreOptimizationPass()); - addPass(new ARMExecutionDepsFix()); + addPass(new ARMExecutionDomainFix()); + addPass(new BreakFalseDeps()); } // Expand some pseudo instructions into multiple instructions to allow Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -20586,7 +20586,7 @@ SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. - // TODO: use undef instead and let ExecutionDepsFix deal with it? + // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; @@ -20614,7 +20614,7 @@ SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. - // TODO: use undef instead and let ExecutionDepsFix deal with it? + // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -432,7 +432,7 @@ // Alias instruction that maps zero vector to pxor / xorp* for AVX-512. // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then -// swizzled by ExecutionDepsFix to pxor. +// swizzled by ExecutionDomainFix to pxor. // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -8013,7 +8013,7 @@ return false; } -/// Inform the ExecutionDepsFix pass how many idle +/// Inform the BreakFalseDeps pass how many idle /// instructions we would like before a partial register update. unsigned X86InstrInfo::getPartialRegUpdateClearance( const MachineInstr &MI, unsigned OpNum, @@ -8166,7 +8166,7 @@ return false; } -/// Inform the ExecutionDepsFix pass how many idle instructions we would like +/// Inform the BreakFalseDeps pass how many idle instructions we would like /// before certain undef register reads. /// /// This catches the VCVTSI2SD family of instructions: Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -336,7 +336,7 @@ // Alias instruction that maps zero vector to pxor / xorp* for sse. // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then -// swizzled by ExecutionDepsFix to pxor. +// swizzled by ExecutionDomainFix to pxor. // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -3122,7 +3122,7 @@ // which has a clobber before the rcp, vs. // vrcpss mem, %xmm0, %xmm0 // TODO: In theory, we could fold the load, and avoid the stall caused by - // the partial register store, either in ExecutionDepsFix or with smarter RA. + // the partial register store, either in BreakFalseDeps or with smarter RA. let Predicates = [target] in { def : Pat<(OpNode RC:$src), (!cast("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; Index: lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- lib/Target/X86/X86RegisterInfo.cpp +++ lib/Target/X86/X86RegisterInfo.cpp @@ -80,7 +80,7 @@ bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { - // ExecutionDepsFixer and PostRAScheduler require liveness. + // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness. return true; } Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -60,7 +60,7 @@ void initializeFixupLEAPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86ExecutionDepsFixPass(PassRegistry &); +void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); } // end namespace llvm @@ -78,7 +78,7 @@ initializeFixupLEAPassPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); - initializeX86ExecutionDepsFixPass(PR); + initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); } @@ -325,20 +325,20 @@ void addPreSched2() override; }; -class X86ExecutionDepsFix : public ExecutionDepsFix { -public: - static char ID; - X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {} - StringRef getPassName() const override { - return "X86 Execution Dependency Fix"; - } -}; -char X86ExecutionDepsFix::ID; - -} // end anonymous namespace - -INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix", - "X86 Execution Dependency Fix", false, false) +class X86ExecutionDomainFix : public ExecutionDomainFix { +public: + static char ID; + X86ExecutionDomainFix() : ExecutionDomainFix(ID, X86::VR128XRegClass) {} + StringRef getPassName() const override { + return "X86 Execution Dependency Fix"; + } +}; +char X86ExecutionDomainFix::ID; + +} // end anonymous namespace + +INITIALIZE_PASS(X86ExecutionDomainFix, "x86-execution-domain-fix", + "X86 Execution Domain Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(*this, PM); @@ -424,8 +424,10 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } void X86PassConfig::addPreEmitPass() { - if (getOptLevel() != CodeGenOpt::None) - addPass(new X86ExecutionDepsFix()); + if (getOptLevel() != CodeGenOpt::None) { + addPass(new X86ExecutionDomainFix()); + addPass(new BreakFalseDeps()); + } if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); Index: test/CodeGen/ARM/deps-fix.ll =================================================================== --- test/CodeGen/ARM/deps-fix.ll +++ test/CodeGen/ARM/deps-fix.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard -mtriple armv7-linux-gnueabi | FileCheck %s -;; This test checks that the ExecutionDepsFix pass performs the domain changes +;; This test checks that the ExecutionDomainFix pass performs the domain changes ;; even when some dependencies are propagated through implicit definitions. ; CHECK: fun_a Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -67,7 +67,7 @@ ; SSE: for.body{{$}} ; ; This loop contains two cvtsi2ss instructions that update the same xmm -; register. Verify that the execution dependency fix pass breaks those +; register. Verify that the break false dependency fix pass breaks those ; dependencies by inserting xorps instructions. ; ; If the register allocator chooses different registers for the two cvtsi2ss @@ -141,7 +141,7 @@ ; This loop contains a cvtsi2sd instruction that has a loop-carried ; false dependency on an xmm that is modified by other scalar instructions ; that follow it in the loop. Additionally, the source of convert is a -; memory operand. Verify the execution dependency fix pass breaks this +; memory operand. Verify the break false dependency fix pass breaks this ; dependency by inserting a xor before the convert. @x = common global [1024 x double] zeroinitializer, align 16 @y = common global [1024 x double] zeroinitializer, align 16