Index: include/llvm/CodeGen/BreakFalseDeps.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/BreakFalseDeps.h @@ -0,0 +1,99 @@ +//==- llvm/CodeGen/BreakFalseDeps.h - Break False Dependency Fix -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file Break False Dependency pass. +/// +/// Some instructions have false dependencies which cause unnecessary stalls. +/// For exmaple, instructions that only write part of a register, and implicitly +/// need to read the other parts of the register. This may cause unwanted +/// stalls preventing otherwise unrelated instructions from executing in +/// parallel in an out-of-order CPU. +/// This pass is aimed at identifying and avoiding these depepndencies when +/// possible. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_BREAKFALSEDEPS_H +#define LLVM_CODEGEN_BREAKFALSEDEPS_H + +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/LoopTraversal.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterClassInfo.h" + +namespace llvm { + +class MachineBasicBlock; +class MachineInstr; +class TargetInstrInfo; + +class BreakFalseDeps : public MachineFunctionPass, LoopTraversal { +private: + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RegClassInfo; + unsigned NumRegUnits; + + typedef std::vector RegDefInfo; + /// Maps each register unit to the instruction that defined it, relative to + /// the beginning of the current basic block. Live-out register unit values + /// may be negative, as they are relative to the end of the basic block. + RegDefInfo Defs; + /// The def instruction of the live out reg units of each basic block. + std::vector OutDefs; + + /// List of undefined register reads in this block in forward order. + std::vector> UndefReads; + + /// Storage for register unit liveness. + LivePhysRegs LiveRegSet; + + /// Current instruction number. + /// The first instruction in each basic block is 0. + int CurInstr; + +public: + static char ID; // Pass identification, replacement for typeid + + BreakFalseDeps() : MachineFunctionPass(ID) { + initializeBreakFalseDepsPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + // Dependency breaking. + bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + unsigned Pref); + bool shouldBreakDependence(MachineInstr *, unsigned OpIdx, unsigned Pref); + void processUndefReads(MachineBasicBlock *); + + // LoopTraversal overrides. + void enterBasicBlock(MachineBasicBlock *) override; + void leaveBasicBlock(MachineBasicBlock *) override; + void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) override; + + // Instruction processing. + void processDefs(MachineInstr *, bool breakDependency); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_BREAKFALSEDEPS_H Index: include/llvm/CodeGen/ExecutionDomainFix.h =================================================================== --- include/llvm/CodeGen/ExecutionDomainFix.h +++ include/llvm/CodeGen/ExecutionDomainFix.h @@ -26,6 +26,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/LoopTraversal.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -105,41 +106,6 @@ } }; -class LoopTraversal { -protected: - struct MBBInfo { - // Whether we have gotten to this block in primary processing yet. - bool PrimaryCompleted = false; - - // The number of predecessors for which primary processing has completed - unsigned IncomingProcessed = 0; - - // The value of `IncomingProcessed` at the start of primary processing - unsigned PrimaryIncoming = 0; - - // The number of predecessors for which all processing steps are done. - unsigned IncomingCompleted = 0; - - MBBInfo() = default; - }; - using MBBInfoMap = DenseMap; - MBBInfoMap MBBInfos; - -public: - LoopTraversal() {} - -protected: - // Overridden by the derived classes. - virtual void enterBasicBlock(MachineBasicBlock *) = 0; - virtual void leaveBasicBlock(MachineBasicBlock *) = 0; - virtual void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) = 0; - - // Traversal operations - void runOnBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass); - bool isBlockDone(MachineBasicBlock *MBB); - bool traverse(MachineFunction &mf); -}; - class ExecutionDomainFix : public MachineFunctionPass, LoopTraversal { SpecificBumpPtrAllocator Allocator; SmallVector Avail; @@ -205,67 +171,7 @@ void processDefs(MachineInstr *, bool Kill); void visitSoftInstr(MachineInstr *, unsigned mask); void visitHardInstr(MachineInstr *, unsigned domain); -}; - -class BreakFalseDeps : public MachineFunctionPass, LoopTraversal { -private: - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RegClassInfo; - unsigned NumRegUnits; - - typedef std::vector RegDefInfo; - /// Maps each register unit to the instruction that defined it, relative to - /// the beginning of the current basic block. Live-out register unit values - /// may be negative, as they are relative to the end of the basic block. - RegDefInfo Defs; - /// The def instruction of the live out reg units of each basic block. - std::vector OutDefs; - - /// List of undefined register reads in this block in forward order. - std::vector> UndefReads; - - /// Storage for register unit liveness. - LivePhysRegs LiveRegSet; - - /// Current instruction number. - /// The first instruction in each basic block is 0. - int CurInstr; - -public: - static char ID; // Pass identification, replacement for typeid - - BreakFalseDeps() : MachineFunctionPass(ID) { - initializeBreakFalseDepsPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - -private: - // Dependency breaking. - bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); - bool shouldBreakDependence(MachineInstr *, unsigned OpIdx, unsigned Pref); - void processUndefReads(MachineBasicBlock *); - - // LoopTraversal overrides. - void enterBasicBlock(MachineBasicBlock *) override; - void leaveBasicBlock(MachineBasicBlock *) override; - void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) override; - // Instruction processing. - void processDefs(MachineInstr *, bool breakDependency); }; } // namespace llvm Index: include/llvm/CodeGen/LoopTraversal.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/LoopTraversal.h @@ -0,0 +1,86 @@ +//==- llvm/CodeGen/LoopTraversal.h - Execution Dependency Fix -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file Loop Traversal logic. +/// +/// This class implements the basic blocks traversal order used by passes like +/// BreakFalseDeps and ExecutionDomainFix. +/// +/// We want to visit every instruction in every basic block in order to update +/// it's execution domain or break any false dependencies. However, for the +/// dependency breaking, we need to know clearances from all predecessors +/// (including any backedges). One way to do so would be to do two complete +/// passes over all basic blocks/instructions, the first for recording +/// clearances, the second to break the dependencies. However, for functions +/// without backedges, or functions with a lot of straight-line code, and +/// a small loop, that would be a lot of unnecessary work (since only the +/// BBs that are part of the loop require two passes). As an example, +/// consider the following loop. +/// +/// +/// PH -> A -> B (xmm -> xmm) -> C -> D -> EXIT +/// ^ | +/// +----------------------------------+ +/// +/// The iteration order is as follows: +/// Naive: PH A B C D A' B' C' D' +/// Optimized: PH A B C A' B' C' D +/// +/// Note that we avoid processing D twice, because we can entirely process +/// the predecessors before getting to D. We call a block that is ready +/// for its second round of processing `done` (isBlockDone). Once we finish +/// processing some block, we update the counters in MBBInfos and re-process +/// any successors that are now done. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_LOOPTRAVERSAL_H +#define LLVM_CODEGEN_LOOPTRAVERSAL_H + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { +class LoopTraversal { +protected: + struct MBBInfo { + // Whether we have gotten to this block in primary processing yet. + bool PrimaryCompleted = false; + + // The number of predecessors for which primary processing has completed + unsigned IncomingProcessed = 0; + + // The value of `IncomingProcessed` at the start of primary processing + unsigned PrimaryIncoming = 0; + + // The number of predecessors for which all processing steps are done. + unsigned IncomingCompleted = 0; + + MBBInfo() = default; + }; + using MBBInfoMap = DenseMap; + MBBInfoMap MBBInfos; + +public: + LoopTraversal() {} + +protected: + // Overridden by the derived classes. + virtual void enterBasicBlock(MachineBasicBlock *) = 0; + virtual void leaveBasicBlock(MachineBasicBlock *) = 0; + virtual void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) = 0; + + // Traversal operations + void runOnBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass); + bool isBlockDone(MachineBasicBlock *MBB); + bool traverse(MachineFunction &mf); +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_LOOPTRAVERSAL_H Index: lib/CodeGen/BreakFalseDeps.cpp =================================================================== --- /dev/null +++ lib/CodeGen/BreakFalseDeps.cpp @@ -0,0 +1,300 @@ +//===- BreakFalseDeps.cpp - Break false dependencies issues -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/BreakFalseDeps.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "break-false-deps" + +char BreakFalseDeps::ID = 0; + +INITIALIZE_PASS(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false) + +// Default values are 'nothing happened a long time ago'. +const int DefaultDefVal = -(1 << 20); + +/// Set up LiveRegs by merging predecessor live-out values. +void BreakFalseDeps::enterBasicBlock(MachineBasicBlock *MBB) { + + // Reset instruction counter in each basic block. + CurInstr = 0; + + // Set up UndefReads to track undefined register reads. + UndefReads.clear(); + LiveRegSet.clear(); + + // Set up Defs to represent registers entering MBB. + if (!Defs.size()) + Defs.resize(NumRegUnits); + + // Default values are 'nothing happened a long time ago'. + for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) { + Defs[Unit] = DefaultDefVal; + } + + // This is the entry block. + if (MBB->pred_empty()) { + for (const auto &LI : MBB->liveins()) { + for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { + // Treat function live-ins as if they were defined just before the first + // instruction. Usually, function arguments are set up immediately + // before the call. + Defs[*Units] = -1; + } + } + DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); + return; + } + + // Try to coalesce live-out register units from predecessors. + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); + pi != pe; ++pi) { + const RegDefInfo &IncomingDefs = OutDefs[(*pi)->getNumber()]; + // Incoming is empty if this is a backedge from a BB + // we haven't processed yet + if (!IncomingDefs.size()) + continue; + + for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) { + // Use the most recent predecessor def for each register unit. + Defs[Unit] = std::max(Defs[Unit], IncomingDefs[Unit]); + } + } + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); +} + +void BreakFalseDeps::leaveBasicBlock(MachineBasicBlock *MBB) { + + assert(Defs.size() && "Must enter basic block first."); + + // While processing the basic block, we kept `Def` relative to the start + // of the basic block for convenience. However, future use of this information + // only cares about the clearance from the end of the block, so adjust + // everything to be relative to the end of the basic block. + for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) + Defs[Unit] -= CurInstr; + + // Save register clearances at end of MBB - used by enterBasicBlock(). + OutDefs[MBB->getNumber()] = Defs; + + Defs.clear(); +} + +/// \brief Helps avoid false dependencies on undef registers by updating the +/// machine instructions' undef operand to use a register that the instruction +/// is truly dependent on, or use a register with clearance higher than Pref. +/// Returns true if it was able to find a true dependency, thus not requiring +/// a dependency breaking instruction regardless of clearance. +bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { + MachineOperand &MO = MI->getOperand(OpIdx); + assert(MO.isUndef() && "Expected undef machine operand"); + + unsigned OriginalReg = MO.getReg(); + + // Update only undef operands that have reg units that are mapped to one root. + for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) { + unsigned NumRoots = 0; + for (MCRegUnitRootIterator Root(*Unit, TRI); Root.isValid(); ++Root) { + NumRoots++; + if (NumRoots > 1) + return false; + } + } + + // Get the undef operand's register class + const TargetRegisterClass *OpRC = + TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); + + // If the instruction has a true dependency, we can hide the false depdency + // behind it. + for (MachineOperand &CurrMO : MI->operands()) { + if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() || + !OpRC->contains(CurrMO.getReg())) + continue; + // We found a true dependency - replace the undef register with the true + // dependency. + MO.setReg(CurrMO.getReg()); + return true; + } + + // Go over all registers in the register class and find the register with + // max clearance or clearance higher than Pref. + unsigned MaxClearance = 0; + unsigned MaxClearanceReg = OriginalReg; + ArrayRef Order = RegClassInfo.getOrder(OpRC); + for (auto Reg : Order) { + int TmpDef = DefaultDefVal; + for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { + TmpDef = std::max(TmpDef, Defs[*Unit]); + } + unsigned Clearance = CurInstr - TmpDef; + if (Clearance <= MaxClearance) + continue; + MaxClearance = Clearance; + MaxClearanceReg = Reg; + + if (MaxClearance > Pref) + break; + } + + // Update the operand if we found a register with better clearance. + if (MaxClearanceReg != OriginalReg) + MO.setReg(MaxClearanceReg); + + return false; +} + +/// \brief Return true to if it makes sense to break dependence on a partial def +/// or undef use. +bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { + unsigned reg = MI->getOperand(OpIdx).getReg(); + for (MCRegUnitIterator Unit(reg, TRI); Unit.isValid(); ++Unit) { + unsigned Clearance = CurInstr - Defs[*Unit]; + DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); + + if (Pref > Clearance) { + DEBUG(dbgs() << ": Break dependency.\n"); + continue; + } + DEBUG(dbgs() << ": OK .\n"); + return false; + } + return true; +} + +// Update def-ages for registers defined by MI. +// Also break dependencies on partial defs and undef uses. +void BreakFalseDeps::processDefs(MachineInstr *MI, bool breakDependency) { + assert(!MI->isDebugValue() && "Won't process debug values"); + + // Break dependence on undef uses. Do this before updating LiveRegs below. + unsigned OpNum; + if (breakDependency) { + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { + bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); + // We don't need to bother trying to break a dependency if this + // instruction has a true dependency on that register through another + // operand - we'll have to wait for it to be available regardless. + if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } + } + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, + e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); + i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isUse()) + continue; + for (MCRegUnitIterator Unit(MO.getReg(), TRI); Unit.isValid(); ++Unit) { + // This instruction explicitly defines rx. + DEBUG(dbgs() << TRI->getName(MO.getReg()) << ":\t" << CurInstr << '\t' + << *MI); + + if (breakDependency) { + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(*MI, i, TRI); + } + + // How many instructions since this reg unit was last written? + Defs[*Unit] = CurInstr; + } + } + ++CurInstr; +} + +/// \break Break false dependencies on undefined register reads. +/// +/// Walk the block backward computing precise liveness. This is expensive, so we +/// only do it on demand. Note that the occurrence of undefined register reads +/// that should be broken is very rare, but when they occur we may have many in +/// a single block. +void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) { + if (UndefReads.empty()) + return; + + // Collect this block's live out register units. + LiveRegSet.init(*TRI); + // We do not need to care about pristine register units as they are just + // preserved but not actually used in the function. + LiveRegSet.addLiveOutsNoPristines(*MBB); + + MachineInstr *UndefMI = UndefReads.back().first; + unsigned OpIdx = UndefReads.back().second; + + for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { + // Update liveness, including the current instruction's defs. + LiveRegSet.stepBackward(I); + + if (UndefMI == &I) { + if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) + TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); + + UndefReads.pop_back(); + if (UndefReads.empty()) + return; + + UndefMI = UndefReads.back().first; + OpIdx = UndefReads.back().second; + } + } +} + +void BreakFalseDeps::processBasicBlock(MachineBasicBlock *MBB, + bool PrimaryPass) { + bool breakDependency = isBlockDone(MBB); + for (MachineInstr &MI : *MBB) { + if (MI.isDebugValue()) + continue; + processDefs(&MI, breakDependency); + } + // If this block is not done, it makes little sense to make any decisions + // based on clearance information. We need to make a second pass anyway, + // and by then we'll have better information, so we can avoid doing the work + // to try and break dependencies now. + if (breakDependency) + processUndefReads(MBB); +} + +bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + MF = &mf; + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + NumRegUnits = TRI->getNumRegUnits(); + RegClassInfo.runOnMachineFunction(mf); + Defs.clear(); + OutDefs.clear(); + OutDefs.resize(MF->getNumBlockIDs()); + + DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n"); + + // Traverse the basic blocks. + traverse(mf); + + UndefReads.clear(); + + return false; +} Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -6,6 +6,7 @@ BasicTargetTransformInfo.cpp BranchFolding.cpp BranchRelaxation.cpp + BreakFalseDeps.cpp BuiltinGCs.cpp CalcSpillWeights.cpp CallingConvLower.cpp @@ -57,6 +58,7 @@ LiveVariables.cpp LLVMTargetMachine.cpp LocalStackSlotAllocation.cpp + LoopTraversal.cpp LowLevelType.cpp LowerEmuTLS.cpp MachineBasicBlock.cpp Index: lib/CodeGen/ExecutionDomainFix.cpp =================================================================== --- lib/CodeGen/ExecutionDomainFix.cpp +++ lib/CodeGen/ExecutionDomainFix.cpp @@ -162,67 +162,6 @@ return true; } -// BreakFalseDeps related -char BreakFalseDeps::ID = 0; - -INITIALIZE_PASS(BreakFalseDeps, DEBUG_TYPE, "BreakFalseDeps", false, false) - -// Default values are 'nothing happened a long time ago'. -const int DefaultDefVal = -(1 << 20); - -/// Set up LiveRegs by merging predecessor live-out values. -void BreakFalseDeps::enterBasicBlock(MachineBasicBlock *MBB) { - - // Reset instruction counter in each basic block. - CurInstr = 0; - - // Set up UndefReads to track undefined register reads. - UndefReads.clear(); - LiveRegSet.clear(); - - // Set up Defs to represent registers entering MBB. - if (!Defs.size()) - Defs.resize(NumRegUnits); - - // Default values are 'nothing happened a long time ago'. - for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) { - Defs[Unit] = DefaultDefVal; - } - - // This is the entry block. - if (MBB->pred_empty()) { - for (const auto &LI : MBB->liveins()) { - for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) { - // Treat function live-ins as if they were defined just before the first - // instruction. Usually, function arguments are set up immediately - // before the call. - Defs[*Units] = -1; - } - } - DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); - return; - } - - // Try to coalesce live-out register units from predecessors. - for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), - pe = MBB->pred_end(); - pi != pe; ++pi) { - const RegDefInfo &IncomingDefs = OutDefs[(*pi)->getNumber()]; - // Incoming is empty if this is a backedge from a BB - // we haven't processed yet - if (!IncomingDefs.size()) - continue; - - for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) { - // Use the most recent predecessor def for each register unit. - Defs[Unit] = std::max(Defs[Unit], IncomingDefs[Unit]); - } - } - DEBUG( - dbgs() << "BB#" << MBB->getNumber() - << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); -} - /// Set up LiveRegs by merging predecessor live-out values. void ExecutionDomainFix::enterBasicBlock(MachineBasicBlock *MBB) { // Set up LiveRegs to represent registers entering MBB. @@ -281,23 +220,6 @@ << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); } -void BreakFalseDeps::leaveBasicBlock(MachineBasicBlock *MBB) { - - assert(Defs.size() && "Must enter basic block first."); - - // While processing the basic block, we kept `Def` relative to the start - // of the basic block for convenience. However, future use of this information - // only cares about the clearance from the end of the block, so adjust - // everything to be relative to the end of the basic block. - for (unsigned Unit = 0; Unit < NumRegUnits; ++Unit) - Defs[Unit] -= CurInstr; - - // Save register clearances at end of MBB - used by enterBasicBlock(). - OutDefs[MBB->getNumber()] = Defs; - - Defs.clear(); -} - void ExecutionDomainFix::leaveBasicBlock(MachineBasicBlock *MBB) { assert(!LiveRegs.empty() && "Must enter basic block first."); for (auto DV : OutRegs[MBB->getNumber()]) { @@ -320,94 +242,7 @@ return !DomP.first; } -/// \brief Helps avoid false dependencies on undef registers by updating the -/// machine instructions' undef operand to use a register that the instruction -/// is truly dependent on, or use a register with clearance higher than Pref. -/// Returns true if it was able to find a true dependency, thus not requiring -/// a dependency breaking instruction regardless of clearance. -bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { - MachineOperand &MO = MI->getOperand(OpIdx); - assert(MO.isUndef() && "Expected undef machine operand"); - - unsigned OriginalReg = MO.getReg(); - - // Update only undef operands that have reg units that are mapped to one root. - for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) { - unsigned NumRoots = 0; - for (MCRegUnitRootIterator Root(*Unit, TRI); Root.isValid(); ++Root) { - NumRoots++; - if (NumRoots > 1) - return false; - } - } - - // Get the undef operand's register class - const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); - - // If the instruction has a true dependency, we can hide the false depdency - // behind it. - for (MachineOperand &CurrMO : MI->operands()) { - if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() || - !OpRC->contains(CurrMO.getReg())) - continue; - // We found a true dependency - replace the undef register with the true - // dependency. - MO.setReg(CurrMO.getReg()); - return true; - } - - // Go over all registers in the register class and find the register with - // max clearance or clearance higher than Pref. - unsigned MaxClearance = 0; - unsigned MaxClearanceReg = OriginalReg; - ArrayRef Order = RegClassInfo.getOrder(OpRC); - for (auto Reg : Order) { - int TmpDef = DefaultDefVal; - for (MCRegUnitIterator Unit(Reg, TRI); Unit.isValid(); ++Unit) { - TmpDef = std::max(TmpDef, Defs[*Unit]); - } - unsigned Clearance = CurInstr - TmpDef; - if (Clearance <= MaxClearance) - continue; - MaxClearance = Clearance; - MaxClearanceReg = Reg; - - if (MaxClearance > Pref) - break; - } - - // Update the operand if we found a register with better clearance. - if (MaxClearanceReg != OriginalReg) - MO.setReg(MaxClearanceReg); - - return false; -} - -/// \brief Return true to if it makes sense to break dependence on a partial def -/// or undef use. -bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { - unsigned reg = MI->getOperand(OpIdx).getReg(); - for (MCRegUnitIterator Unit(reg, TRI); Unit.isValid(); ++Unit) { - unsigned Clearance = CurInstr - Defs[*Unit]; - DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); - - if (Pref > Clearance) { - DEBUG(dbgs() << ": Break dependency.\n"); - continue; - } - DEBUG(dbgs() << ": OK .\n"); - return false; - } - return true; -} - -// Update def-ages for registers defined by MI. -// If Kill is set, also kill off DomainValues clobbered by the defs. -// -// Also break dependencies on partial defs and undef uses. +// If Kill is set, kill off DomainValues clobbered by the defs. void ExecutionDomainFix::processDefs(MachineInstr *MI, bool Kill) { assert(!MI->isDebugValue() && "Won't process debug values"); const MCInstrDesc &MCID = MI->getDesc(); @@ -430,90 +265,6 @@ } } -// Update def-ages for registers defined by MI. -// Also break dependencies on partial defs and undef uses. -void BreakFalseDeps::processDefs(MachineInstr *MI, bool breakDependency) { - assert(!MI->isDebugValue() && "Won't process debug values"); - - // Break dependence on undef uses. Do this before updating LiveRegs below. - unsigned OpNum; - if (breakDependency) { - unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); - if (Pref) { - bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); - // We don't need to bother trying to break a dependency if this - // instruction has a true dependency on that register through another - // operand - we'll have to wait for it to be available regardless. - if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); - } - } - const MCInstrDesc &MCID = MI->getDesc(); - for (unsigned i = 0, - e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); - i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg() || !MO.getReg()) - continue; - if (MO.isUse()) - continue; - for (MCRegUnitIterator Unit(MO.getReg(), TRI); Unit.isValid(); ++Unit) { - // This instruction explicitly defines rx. - DEBUG(dbgs() << TRI->getName(MO.getReg()) << ":\t" << CurInstr << '\t' - << *MI); - - if (breakDependency) { - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. - unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(*MI, i, TRI); - } - - // How many instructions since this reg unit was last written? - Defs[*Unit] = CurInstr; - } - } - ++CurInstr; -} - -/// \break Break false dependencies on undefined register reads. -/// -/// Walk the block backward computing precise liveness. This is expensive, so we -/// only do it on demand. Note that the occurrence of undefined register reads -/// that should be broken is very rare, but when they occur we may have many in -/// a single block. -void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) { - if (UndefReads.empty()) - return; - - // Collect this block's live out register units. - LiveRegSet.init(*TRI); - // We do not need to care about pristine registers as they are just preserved - // but not actually used in the function. - LiveRegSet.addLiveOutsNoPristines(*MBB); - - MachineInstr *UndefMI = UndefReads.back().first; - unsigned OpIdx = UndefReads.back().second; - - for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { - // Update liveness, including the current instruction's defs. - LiveRegSet.stepBackward(I); - - if (UndefMI == &I) { - if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) - TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); - - UndefReads.pop_back(); - if (UndefReads.empty()) - return; - - UndefMI = UndefReads.back().first; - OpIdx = UndefReads.back().second; - } - } -} - // A hard instruction only works in one domain. All input registers will be // forced into that domain. void ExecutionDomainFix::visitHardInstr(MachineInstr *mi, unsigned domain) { @@ -666,129 +417,6 @@ } } -void BreakFalseDeps::processBasicBlock(MachineBasicBlock *MBB, - bool PrimaryPass) { - // If this block is not done, it makes little sense to make any decisions - // based on clearance information. We need to make a second pass anyway, - // and by then we'll have better information, so we can avoid doing the work - // to try and break dependencies now. - bool breakDependency = isBlockDone(MBB); - for (MachineInstr &MI : *MBB) { - if (MI.isDebugValue()) - continue; - processDefs(&MI, breakDependency); - } - if (breakDependency) - processUndefReads(MBB); -} - -/// Includes the general logic of handeling a basic block, activating sub phases -/// that are implemented by the derived classes. -void LoopTraversal::runOnBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) { - enterBasicBlock(MBB); - processBasicBlock(MBB, PrimaryPass); - leaveBasicBlock(MBB); -} - -/// Have we finished processing this block and all its predecessors. -bool LoopTraversal::isBlockDone(MachineBasicBlock *MBB) { - return MBBInfos[MBB].PrimaryCompleted && - MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && - MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); -} - -bool LoopTraversal::traverse(MachineFunction &mf) { - - // Initialize the MMBInfos - for (auto &MBB : mf) { - MBBInfo InitialInfo; - MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); - } - - /* - * We want to visit every instruction in every basic block in order to update - * it's execution domain or break any false dependencies. However, for the - * dependency breaking, we need to know clearances from all predecessors - * (including any backedges). One way to do so would be to do two complete - * passes over all basic blocks/instructions, the first for recording - * clearances, the second to break the dependencies. However, for functions - * without backedges, or functions with a lot of straight-line code, and - * a small loop, that would be a lot of unnecessary work (since only the - * BBs that are part of the loop require two passes). As an example, - * consider the following loop. - * - * - * PH -> A -> B (xmm -> xmm) -> C -> D -> EXIT - * ^ | - * +----------------------------------+ - * - * The iteration order is as follows: - * Naive: PH A B C D A' B' C' D' - * Optimized: PH A B C A' B' C' D - * - * Note that we avoid processing D twice, because we can entirely process - * the predecessors before getting to D. We call a block that is ready - * for its second round of processing `done` (isBlockDone). Once we finish - * processing some block, we update the counters in MBBInfos and re-process - * any successors that are now done. - */ - - MachineBasicBlock *Entry = &*mf.begin(); - ReversePostOrderTraversal RPOT(Entry); - SmallVector Workqueue; - for (ReversePostOrderTraversal::rpo_iterator - MBBI = RPOT.begin(), - MBBE = RPOT.end(); - MBBI != MBBE; ++MBBI) { - MachineBasicBlock *MBB = *MBBI; - // N.B: IncomingProcessed and IncomingCompleted were already updated while - // processing this block's predecessors. - MBBInfos[MBB].PrimaryCompleted = true; - MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; - bool Primary = true; - Workqueue.push_back(MBB); - while (!Workqueue.empty()) { - MachineBasicBlock *ActiveMBB = &*Workqueue.back(); - Workqueue.pop_back(); - runOnBasicBlock(ActiveMBB, Primary); - bool Done = isBlockDone(ActiveMBB); - for (auto *Succ : ActiveMBB->successors()) { - if (!isBlockDone(Succ)) { - if (Primary) { - MBBInfos[Succ].IncomingProcessed++; - } - if (Done) { - MBBInfos[Succ].IncomingCompleted++; - } - if (isBlockDone(Succ)) { - Workqueue.push_back(Succ); - } - } - } - Primary = false; - } - } - - // We need to go through again and finalize any blocks that are not done yet. - // This is possible if blocks have dead predecessors, so we didn't visit them - // above. - for (ReversePostOrderTraversal::rpo_iterator - MBBI = RPOT.begin(), - MBBE = RPOT.end(); - MBBI != MBBE; ++MBBI) { - MachineBasicBlock *MBB = *MBBI; - if (!isBlockDone(MBB)) { - runOnBasicBlock(MBB, false); - // Don't update successors here. We'll get to them anyway through this - // loop. - } - } - - MBBInfos.clear(); - - return false; -} - bool ExecutionDomainFix::runOnMachineFunction(MachineFunction &mf) { if (skipFunction(*mf.getFunction())) return false; @@ -845,25 +473,3 @@ return false; } - -bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) { - if (skipFunction(*mf.getFunction())) - return false; - MF = &mf; - TII = MF->getSubtarget().getInstrInfo(); - TRI = MF->getSubtarget().getRegisterInfo(); - NumRegUnits = TRI->getNumRegUnits(); - RegClassInfo.runOnMachineFunction(mf); - Defs.clear(); - OutDefs.clear(); - OutDefs.resize(MF->getNumBlockIDs()); - - DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n"); - - // Traverse the basic blocks. - traverse(mf); - - UndefReads.clear(); - - return false; -} Index: lib/CodeGen/LoopTraversal.cpp =================================================================== --- /dev/null +++ lib/CodeGen/LoopTraversal.cpp @@ -0,0 +1,96 @@ +//===- LoopTraversal.cpp - Fix execution dependecy issues ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/LoopTraversal.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + + +/// Includes the general logic of handeling a basic block, activating sub phases +/// that are implemented by the derived classes. +void LoopTraversal::runOnBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) { + enterBasicBlock(MBB); + processBasicBlock(MBB, PrimaryPass); + leaveBasicBlock(MBB); +} + +/// Have we finished processing this block and all its predecessors. +bool LoopTraversal::isBlockDone(MachineBasicBlock *MBB) { + return MBBInfos[MBB].PrimaryCompleted && + MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && + MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); +} + +bool LoopTraversal::traverse(MachineFunction &mf) { + + // Initialize the MMBInfos + for (auto &MBB : mf) { + MBBInfo InitialInfo; + MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); + } + + // Go over the basic blocks, optimizing the loop traverstal. + MachineBasicBlock *Entry = &*mf.begin(); + ReversePostOrderTraversal RPOT(Entry); + SmallVector Workqueue; + for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + // N.B: IncomingProcessed and IncomingCompleted were already updated while + // processing this block's predecessors. + MBBInfos[MBB].PrimaryCompleted = true; + MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; + bool Primary = true; + Workqueue.push_back(MBB); + while (!Workqueue.empty()) { + MachineBasicBlock *ActiveMBB = &*Workqueue.back(); + Workqueue.pop_back(); + runOnBasicBlock(ActiveMBB, Primary); + bool Done = isBlockDone(ActiveMBB); + for (auto *Succ : ActiveMBB->successors()) { + if (!isBlockDone(Succ)) { + if (Primary) { + MBBInfos[Succ].IncomingProcessed++; + } + if (Done) { + MBBInfos[Succ].IncomingCompleted++; + } + if (isBlockDone(Succ)) { + Workqueue.push_back(Succ); + } + } + } + Primary = false; + } + } + + // We need to go through again and finalize any blocks that are not done yet. + // This is possible if blocks have dead predecessors, so we didn't visit them + // above. + for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + if (!isBlockDone(MBB)) { + runOnBasicBlock(MBB, false); + // Don't update successors here. We'll get to them anyway through this + // loop. + } + } + + MBBInfos.clear(); + + return false; +} Index: lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- lib/Target/ARM/ARMTargetMachine.cpp +++ lib/Target/ARM/ARMTargetMachine.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BreakFalseDeps.h" #include "llvm/CodeGen/ExecutionDomainFix.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BreakFalseDeps.h" #include "llvm/CodeGen/ExecutionDomainFix.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h"