Index: include/llvm/CodeGen/ExecutionDepsFix.h =================================================================== --- include/llvm/CodeGen/ExecutionDepsFix.h +++ include/llvm/CodeGen/ExecutionDepsFix.h @@ -1,230 +1,354 @@ -//==- llvm/CodeGen/ExecutionDepsFix.h - Execution Dependency Fix -*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file Execution Dependency Fix pass. -/// -/// Some X86 SSE instructions like mov, and, or, xor are available in different -/// variants for different operand types. These variant instructions are -/// equivalent, but on Nehalem and newer cpus there is extra latency -/// transferring data between integer and floating point domains. ARM cores -/// have similar issues when they are configured with both VFP and NEON -/// pipelines. -/// -/// This pass changes the variant instructions to minimize domain crossings. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_EXECUTIONDEPSFIX_H -#define LLVM_CODEGEN_EXECUTIONDEPSFIX_H - -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/RegisterClassInfo.h" -#include "llvm/Pass.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/MathExtras.h" -#include -#include -#include -#include - -namespace llvm { - -class MachineBasicBlock; -class MachineInstr; -class TargetInstrInfo; - -/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track -/// of execution domains. -/// -/// An open DomainValue represents a set of instructions that can still switch -/// execution domain. Multiple registers may refer to the same open -/// DomainValue - they will eventually be collapsed to the same execution -/// domain. -/// -/// A collapsed DomainValue represents a single register that has been forced -/// into one of more execution domains. There is a separate collapsed -/// DomainValue for each register, but it may contain multiple execution -/// domains. A register value is initially created in a single execution -/// domain, but if we were forced to pay the penalty of a domain crossing, we -/// keep track of the fact that the register is now available in multiple -/// domains. -struct DomainValue { - // Basic reference counting. - unsigned Refs = 0; - - // Bitmask of available domains. For an open DomainValue, it is the still - // possible domains for collapsing. For a collapsed DomainValue it is the - // domains where the register is available for free. - unsigned AvailableDomains; - - // Pointer to the next DomainValue in a chain. When two DomainValues are - // merged, Victim.Next is set to point to Victor, so old DomainValue - // references can be updated by following the chain. - DomainValue *Next; - - // Twiddleable instructions using or defining these registers. - SmallVector Instrs; - - DomainValue() { clear(); } - - // A collapsed DomainValue has no instructions to twiddle - it simply keeps - // track of the domains where the registers are already available. - bool isCollapsed() const { return Instrs.empty(); } - - // Is domain available? - bool hasDomain(unsigned domain) const { - assert(domain < - static_cast(std::numeric_limits::digits) && - "undefined behavior"); - return AvailableDomains & (1u << domain); - } - - // Mark domain as available. - void addDomain(unsigned domain) { - AvailableDomains |= 1u << domain; - } - - // Restrict to a single domain available. - void setSingleDomain(unsigned domain) { - AvailableDomains = 1u << domain; - } - - // Return bitmask of domains that are available and in mask. - unsigned getCommonDomains(unsigned mask) const { - return AvailableDomains & mask; - } - - // First domain available. - unsigned getFirstDomain() const { - return countTrailingZeros(AvailableDomains); - } - - // Clear this DomainValue and point to next which has all its data. - void clear() { - AvailableDomains = 0; - Next = nullptr; - Instrs.clear(); - } -}; - -/// Information about a live register. -struct LiveReg { - /// Value currently in this register, or NULL when no value is being tracked. - /// This counts as a DomainValue reference. - DomainValue *Value; - - /// Instruction that defined this register, relative to the beginning of the - /// current basic block. When a LiveReg is used to represent a live-out - /// register, this value is relative to the end of the basic block, so it - /// will be a negative number. - int Def; -}; - -class ExecutionDepsFix : public MachineFunctionPass { - SpecificBumpPtrAllocator Allocator; - SmallVector Avail; - - const TargetRegisterClass *const RC; - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - RegisterClassInfo RegClassInfo; - std::vector> AliasMap; - const unsigned NumRegs; - LiveReg *LiveRegs; - struct MBBInfo { - // Keeps clearance and domain information for all registers. Note that this - // is different from the usual definition notion of liveness. The CPU - // doesn't care whether or not we consider a register killed. - LiveReg *OutRegs = nullptr; - - // Whether we have gotten to this block in primary processing yet. - bool PrimaryCompleted = false; - - // The number of predecessors for which primary processing has completed - unsigned IncomingProcessed = 0; - - // The value of `IncomingProcessed` at the start of primary processing - unsigned PrimaryIncoming = 0; - - // The number of predecessors for which all processing steps are done. - unsigned IncomingCompleted = 0; - - MBBInfo() = default; - }; - using MBBInfoMap = DenseMap; - MBBInfoMap MBBInfos; - - /// List of undefined register reads in this block in forward order. - std::vector> UndefReads; - - /// Storage for register unit liveness. - LivePhysRegs LiveRegSet; - - /// Current instruction number. - /// The first instruction in each basic block is 0. - int CurInstr; - -public: - ExecutionDepsFix(char &PassID, const TargetRegisterClass &RC) - : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - MachineFunctionProperties getRequiredProperties() const override { - return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); - } - -private: - iterator_range::const_iterator> - regIndices(unsigned Reg) const; - // DomainValue allocation. - DomainValue *alloc(int domain = -1); - DomainValue *retain(DomainValue *DV) { - if (DV) ++DV->Refs; - return DV; - } - void release(DomainValue*); - DomainValue *resolve(DomainValue*&); - - // LiveRegs manipulations. - void setLiveReg(int rx, DomainValue *DV); - void kill(int rx); - void force(int rx, unsigned domain); - void collapse(DomainValue *dv, unsigned domain); - bool merge(DomainValue *A, DomainValue *B); - - void enterBasicBlock(MachineBasicBlock*); - void leaveBasicBlock(MachineBasicBlock*); - bool isBlockDone(MachineBasicBlock *); - void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass); - bool visitInstr(MachineInstr *); - void processDefs(MachineInstr *, bool breakDependency, bool Kill); - void visitSoftInstr(MachineInstr*, unsigned mask); - void visitHardInstr(MachineInstr*, unsigned domain); - bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, - unsigned Pref); - bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); - void processUndefReads(MachineBasicBlock*); -}; - -} // end namepsace llvm - -#endif // LLVM_CODEGEN_EXECUTIONDEPSFIX_H +//==- llvm/CodeGen/ExecutionDepsFix.h - Execution Dependency Fix -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file Execution Dependency Fix pass. +/// +/// Some X86 SSE instructions like mov, and, or, xor are available in different +/// variants for different operand types. These variant instructions are +/// equivalent, but on Nehalem and newer cpus there is extra latency +/// transferring data between integer and floating point domains. ARM cores +/// have similar issues when they are configured with both VFP and NEON +/// pipelines. +/// +/// This pass changes the variant instructions to minimize domain crossings. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_EXECUTIONDEPSFIX_H +#define LLVM_CODEGEN_EXECUTIONDEPSFIX_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/Pass.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/MathExtras.h" +#include +#include +#include +#include + +namespace llvm { + +class MachineBasicBlock; +class MachineInstr; +class TargetInstrInfo; + +/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track +/// of execution domains. +/// +/// An open DomainValue represents a set of instructions that can still switch +/// execution domain. Multiple registers may refer to the same open +/// DomainValue - they will eventually be collapsed to the same execution +/// domain. +/// +/// A collapsed DomainValue represents a single register that has been forced +/// into one of more execution domains. There is a separate collapsed +/// DomainValue for each register, but it may contain multiple execution +/// domains. A register value is initially created in a single execution +/// domain, but if we were forced to pay the penalty of a domain crossing, we +/// keep track of the fact that the register is now available in multiple +/// domains. +struct DomainValue { + // Basic reference counting. + unsigned Refs = 0; + + // Bitmask of available domains. For an open DomainValue, it is the still + // possible domains for collapsing. For a collapsed DomainValue it is the + // domains where the register is available for free. + unsigned AvailableDomains; + + // Pointer to the next DomainValue in a chain. When two DomainValues are + // merged, Victim.Next is set to point to Victor, so old DomainValue + // references can be updated by following the chain. + DomainValue *Next; + + // Twiddleable instructions using or defining these registers. + SmallVector Instrs; + + DomainValue() { clear(); } + + // A collapsed DomainValue has no instructions to twiddle - it simply keeps + // track of the domains where the registers are already available. + bool isCollapsed() const { return Instrs.empty(); } + + // Is domain available? + bool hasDomain(unsigned domain) const { + assert(domain < + static_cast(std::numeric_limits::digits) && + "undefined behavior"); + return AvailableDomains & (1u << domain); + } + + // Mark domain as available. + void addDomain(unsigned domain) { + AvailableDomains |= 1u << domain; + } + + // Restrict to a single domain available. + void setSingleDomain(unsigned domain) { + AvailableDomains = 1u << domain; + } + + // Return bitmask of domains that are available and in mask. + unsigned getCommonDomains(unsigned mask) const { + return AvailableDomains & mask; + } + + // First domain available. + unsigned getFirstDomain() const { + return countTrailingZeros(AvailableDomains); + } + + // Clear this DomainValue and point to next which has all its data. + void clear() { + AvailableDomains = 0; + Next = nullptr; + Instrs.clear(); + } +}; + +/// Information about a live register. +struct LiveReg { + /// Value currently in this register, or NULL when no value is being tracked. + /// This counts as a DomainValue reference. + DomainValue *Value; + + /// Instruction that defined this register, relative to the beginning of the + /// current basic block. When a LiveReg is used to represent a live-out + /// register, this value is relative to the end of the basic block, so it + /// will be a negative number. + int Def; +}; + +/// This class provides the basic blocks traversal order used by passes like +/// ReachingDefAnalysis and ExecutionDomainFix. +/// It identifies basic blocks that are part of loops and should to be visited twise +/// and returns efficient traversal order for all the blocks. +class LoopTraversal { +private: + struct MBBInfo { + // Whether we have gotten to this block in primary processing yet. + bool PrimaryCompleted = false; + + // The number of predecessors for which primary processing has completed + unsigned IncomingProcessed = 0; + + // The value of `IncomingProcessed` at the start of primary processing + unsigned PrimaryIncoming = 0; + + // The number of predecessors for which all processing steps are done. + unsigned IncomingCompleted = 0; + + MBBInfo() = default; + }; + using MBBInfoMap = DenseMap; + MBBInfoMap MBBInfos; + +public: + struct TraversedMBBInfo { + MachineBasicBlock *MBB = nullptr; + bool PrimaryPass = true; + bool IsDone = true; + + TraversedMBBInfo(MachineBasicBlock *BB = nullptr, bool Primary = true, + bool Done = true) + : MBB(BB), PrimaryPass(Primary), IsDone(Done) {} + }; + LoopTraversal() {} + + const SmallVector traverse(MachineFunction &MF); + +private: + bool isBlockDone(MachineBasicBlock *MBB); + +}; + +/// This class provides the reaching def analysis. +class ReachingDefAnalysis : public MachineFunctionPass { +private: + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RegClassInfo; + unsigned NumRegUnits; + LiveReg *LiveRegs; + + // Keeps clearance information for all registers. Note that this + // is different from the usual definition notion of liveness. The CPU + // doesn't care whether or not we consider a register killed. + using OutRegsInfoMap = DenseMap; + OutRegsInfoMap MBBOutRegsInfos; + + /// Current instruction number. + /// The first instruction in each basic block is 0. + int CurInstr; + + /// Maps instructions to their instruction Ids, relative to the begining of + /// their basic blocks. + DenseMap InstIds; + + /// All reaching defs of a given RegUnit for a given MBB. + using MBBRegUnitDefs = SmallVector; + /// All reaching defs of all reg units for a given MBB + using MBBDefsInfo = std::vector; + /// All reaching defs of all reg units for a all MBBs + using MBBReachingDefsInfo = SmallVector; + MBBReachingDefsInfo MBBReachingDefs; + +public: + static char ID; // Pass identification, replacement for typeid + + ReachingDefAnalysis() : MachineFunctionPass(ID) { + initializeReachingDefAnalysisPass(*PassRegistry::getPassRegistry()); + } + void releaseMemory() override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + + /// Provides the instruction id of the closest reaching def instruction of + /// PhysReg that reaches MI, relative to the begining of MI's basic block. + int getReachingDef(MachineInstr *MI, int PhysReg); + /// Provides the clearance - the number of instructions since the closest + /// reaching def instuction of PhysReg that reaches MI. + int getClearance(MachineInstr *MI, MCPhysReg PhysReg); + +private: + void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); + void leaveBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); + void processBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); + void processDefs(MachineInstr *); +}; + +class ExecutionDomainFix : public MachineFunctionPass { + SpecificBumpPtrAllocator Allocator; + SmallVector Avail; + + const TargetRegisterClass *const RC; + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RegClassInfo; + std::vector> AliasMap; + const unsigned NumRegs; + LiveReg *LiveRegs; + // Keeps domain information for all registers. Note that this + // is different from the usual definition notion of liveness. The CPU + // doesn't care whether or not we consider a register killed. + using OutRegsInfoMap = DenseMap; + OutRegsInfoMap MBBOutRegsInfos; + + ReachingDefAnalysis *RDA; + +public: + ExecutionDomainFix(char &PassID, const TargetRegisterClass &RC) + : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + iterator_range::const_iterator> + regIndices(unsigned Reg) const; + // DomainValue allocation. + DomainValue *alloc(int domain = -1); + DomainValue *retain(DomainValue *DV) { + if (DV) ++DV->Refs; + return DV; + } + void release(DomainValue*); + DomainValue *resolve(DomainValue*&); + + // LiveRegs manipulations. + void setLiveReg(int rx, DomainValue *DV); + void kill(int rx); + void force(int rx, unsigned domain); + void collapse(DomainValue *dv, unsigned domain); + bool merge(DomainValue *A, DomainValue *B); + + void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); + void leaveBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); + void processBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); + bool visitInstr(MachineInstr *); + void processDefs(MachineInstr *, bool Kill); + void visitSoftInstr(MachineInstr*, unsigned mask); + void visitHardInstr(MachineInstr*, unsigned domain); +}; + +class BreakFalseDeps : public MachineFunctionPass { +private: + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + RegisterClassInfo RegClassInfo; + + /// List of undefined register reads in this block in forward order. + std::vector> UndefReads; + + /// Storage for register unit liveness. + LivePhysRegs LiveRegSet; + + ReachingDefAnalysis *RDA; + +public: + static char ID; // Pass identification, replacement for typeid + + BreakFalseDeps() : MachineFunctionPass(ID) { + initializeBreakFalseDepsPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void processBasicBlock(MachineBasicBlock *MBB); + void processDefs(MachineInstr *MI); + bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, + unsigned Pref); + bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref); + void processUndefReads(MachineBasicBlock*); +}; + +} // end namepsace llvm + +#endif // LLVM_CODEGEN_EXECUTIONDEPSFIX_H Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -80,6 +80,7 @@ void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&); void initializeBranchRelaxationPass(PassRegistry&); void initializeBreakCriticalEdgesPass(PassRegistry&); +void initializeBreakFalseDepsPass(PassRegistry&); void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&); void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&); @@ -311,6 +312,7 @@ void initializeRAGreedyPass(PassRegistry&); void initializeReassociateLegacyPassPass(PassRegistry&); void initializeRegBankSelectPass(PassRegistry&); +void initializeReachingDefAnalysisPass(PassRegistry&); void initializeRegToMemPass(PassRegistry&); void initializeRegionInfoPassPass(PassRegistry&); void initializeRegionOnlyPrinterPass(PassRegistry&); Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -1,755 +1,983 @@ -//===- ExecutionDepsFix.cpp - Fix execution dependecy issues ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/CodeGen/ExecutionDepsFix.h" - -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/CodeGen/LivePhysRegs.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterClassInfo.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "execution-deps-fix" - -/// Translate TRI register number to a list of indices into our smaller tables -/// of interesting registers. -iterator_range::const_iterator> -ExecutionDepsFix::regIndices(unsigned Reg) const { - assert(Reg < AliasMap.size() && "Invalid register"); - const auto &Entry = AliasMap[Reg]; - return make_range(Entry.begin(), Entry.end()); -} - -DomainValue *ExecutionDepsFix::alloc(int domain) { - DomainValue *dv = Avail.empty() ? - new(Allocator.Allocate()) DomainValue : - Avail.pop_back_val(); - if (domain >= 0) - dv->addDomain(domain); - assert(dv->Refs == 0 && "Reference count wasn't cleared"); - assert(!dv->Next && "Chained DomainValue shouldn't have been recycled"); - return dv; -} - -/// Release a reference to DV. When the last reference is released, -/// collapse if needed. -void ExecutionDepsFix::release(DomainValue *DV) { - while (DV) { - assert(DV->Refs && "Bad DomainValue"); - if (--DV->Refs) - return; - - // There are no more DV references. Collapse any contained instructions. - if (DV->AvailableDomains && !DV->isCollapsed()) - collapse(DV, DV->getFirstDomain()); - - DomainValue *Next = DV->Next; - DV->clear(); - Avail.push_back(DV); - // Also release the next DomainValue in the chain. - DV = Next; - } -} - -/// Follow the chain of dead DomainValues until a live DomainValue is reached. -/// Update the referenced pointer when necessary. -DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) { - DomainValue *DV = DVRef; - if (!DV || !DV->Next) - return DV; - - // DV has a chain. Find the end. - do DV = DV->Next; - while (DV->Next); - - // Update DVRef to point to DV. - retain(DV); - release(DVRef); - DVRef = DV; - return DV; -} - -/// Set LiveRegs[rx] = dv, updating reference counts. -void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) { - assert(unsigned(rx) < NumRegs && "Invalid index"); - assert(LiveRegs && "Must enter basic block first."); - - if (LiveRegs[rx].Value == dv) - return; - if (LiveRegs[rx].Value) - release(LiveRegs[rx].Value); - LiveRegs[rx].Value = retain(dv); -} - -// Kill register rx, recycle or collapse any DomainValue. -void ExecutionDepsFix::kill(int rx) { - assert(unsigned(rx) < NumRegs && "Invalid index"); - assert(LiveRegs && "Must enter basic block first."); - if (!LiveRegs[rx].Value) - return; - - release(LiveRegs[rx].Value); - LiveRegs[rx].Value = nullptr; -} - -/// Force register rx into domain. -void ExecutionDepsFix::force(int rx, unsigned domain) { - assert(unsigned(rx) < NumRegs && "Invalid index"); - assert(LiveRegs && "Must enter basic block first."); - if (DomainValue *dv = LiveRegs[rx].Value) { - if (dv->isCollapsed()) - dv->addDomain(domain); - else if (dv->hasDomain(domain)) - collapse(dv, domain); - else { - // This is an incompatible open DomainValue. Collapse it to whatever and - // force the new value into domain. This costs a domain crossing. - collapse(dv, dv->getFirstDomain()); - assert(LiveRegs[rx].Value && "Not live after collapse?"); - LiveRegs[rx].Value->addDomain(domain); - } - } else { - // Set up basic collapsed DomainValue. - setLiveReg(rx, alloc(domain)); - } -} - -/// Collapse open DomainValue into given domain. If there are multiple -/// registers using dv, they each get a unique collapsed DomainValue. -void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) { - assert(dv->hasDomain(domain) && "Cannot collapse"); - - // Collapse all the instructions. - while (!dv->Instrs.empty()) - TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain); - dv->setSingleDomain(domain); - - // If there are multiple users, give them new, unique DomainValues. - if (LiveRegs && dv->Refs > 1) - for (unsigned rx = 0; rx != NumRegs; ++rx) - if (LiveRegs[rx].Value == dv) - setLiveReg(rx, alloc(domain)); -} - -/// All instructions and registers in B are moved to A, and B is released. -bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) { - assert(!A->isCollapsed() && "Cannot merge into collapsed"); - assert(!B->isCollapsed() && "Cannot merge from collapsed"); - if (A == B) - return true; - // Restrict to the domains that A and B have in common. - unsigned common = A->getCommonDomains(B->AvailableDomains); - if (!common) - return false; - A->AvailableDomains = common; - A->Instrs.append(B->Instrs.begin(), B->Instrs.end()); - - // Clear the old DomainValue so we won't try to swizzle instructions twice. - B->clear(); - // All uses of B are referred to A. - B->Next = retain(A); - - for (unsigned rx = 0; rx != NumRegs; ++rx) { - assert(LiveRegs && "no space allocated for live registers"); - if (LiveRegs[rx].Value == B) - setLiveReg(rx, A); - } - return true; -} - -/// Set up LiveRegs by merging predecessor live-out values. -void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { - // Reset instruction counter in each basic block. - CurInstr = 0; - - // Set up UndefReads to track undefined register reads. - UndefReads.clear(); - LiveRegSet.clear(); - - // Set up LiveRegs to represent registers entering MBB. - if (!LiveRegs) - LiveRegs = new LiveReg[NumRegs]; - - // Default values are 'nothing happened a long time ago'. - for (unsigned rx = 0; rx != NumRegs; ++rx) { - LiveRegs[rx].Value = nullptr; - LiveRegs[rx].Def = -(1 << 20); - } - - // This is the entry block. - if (MBB->pred_empty()) { - for (const auto &LI : MBB->liveins()) { - for (int rx : regIndices(LI.PhysReg)) { - // Treat function live-ins as if they were defined just before the first - // instruction. Usually, function arguments are set up immediately - // before the call. - LiveRegs[rx].Def = -1; - } - } - DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); - return; - } - - // Try to coalesce live-out registers from predecessors. - for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), - pe = MBB->pred_end(); pi != pe; ++pi) { - auto fi = MBBInfos.find(*pi); - assert(fi != MBBInfos.end() && - "Should have pre-allocated MBBInfos for all MBBs"); - LiveReg *Incoming = fi->second.OutRegs; - // Incoming is null if this is a backedge from a BB - // we haven't processed yet - if (Incoming == nullptr) { - continue; - } - - for (unsigned rx = 0; rx != NumRegs; ++rx) { - // Use the most recent predecessor def for each register. - LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); - - DomainValue *pdv = resolve(Incoming[rx].Value); - if (!pdv) - continue; - if (!LiveRegs[rx].Value) { - setLiveReg(rx, pdv); - continue; - } - - // We have a live DomainValue from more than one predecessor. - if (LiveRegs[rx].Value->isCollapsed()) { - // We are already collapsed, but predecessor is not. Force it. - unsigned Domain = LiveRegs[rx].Value->getFirstDomain(); - if (!pdv->isCollapsed() && pdv->hasDomain(Domain)) - collapse(pdv, Domain); - continue; - } - - // Currently open, merge in predecessor. - if (!pdv->isCollapsed()) - merge(LiveRegs[rx].Value, pdv); - else - force(rx, pdv->getFirstDomain()); - } - } - DEBUG( - dbgs() << "BB#" << MBB->getNumber() - << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); -} - -void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { - assert(LiveRegs && "Must enter basic block first."); - LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; - // Save register clearances at end of MBB - used by enterBasicBlock(). - MBBInfos[MBB].OutRegs = LiveRegs; - - // While processing the basic block, we kept `Def` relative to the start - // of the basic block for convenience. However, future use of this information - // only cares about the clearance from the end of the block, so adjust - // everything to be relative to the end of the basic block. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - LiveRegs[i].Def -= CurInstr; - if (OldOutRegs) { - // This must be the second pass. - // Release all the DomainValues instead of keeping them. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - release(OldOutRegs[i].Value); - delete[] OldOutRegs; - } - LiveRegs = nullptr; -} - -bool ExecutionDepsFix::visitInstr(MachineInstr *MI) { - // Update instructions with explicit execution domains. - std::pair DomP = TII->getExecutionDomain(*MI); - if (DomP.first) { - if (DomP.second) - visitSoftInstr(MI, DomP.second); - else - visitHardInstr(MI, DomP.first); - } - - return !DomP.first; -} - -/// \brief Helps avoid false dependencies on undef registers by updating the -/// machine instructions' undef operand to use a register that the instruction -/// is truly dependent on, or use a register with clearance higher than Pref. -/// Returns true if it was able to find a true dependency, thus not requiring -/// a dependency breaking instruction regardless of clearance. -bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI, - unsigned OpIdx, unsigned Pref) { - MachineOperand &MO = MI->getOperand(OpIdx); - assert(MO.isUndef() && "Expected undef machine operand"); - - unsigned OriginalReg = MO.getReg(); - - // Update only undef operands that are mapped to one register. - if (AliasMap[OriginalReg].size() != 1) - return false; - - // Get the undef operand's register class - const TargetRegisterClass *OpRC = - TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); - - // If the instruction has a true dependency, we can hide the false depdency - // behind it. - for (MachineOperand &CurrMO : MI->operands()) { - if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() || - !OpRC->contains(CurrMO.getReg())) - continue; - // We found a true dependency - replace the undef register with the true - // dependency. - MO.setReg(CurrMO.getReg()); - return true; - } - - // Go over all registers in the register class and find the register with - // max clearance or clearance higher than Pref. - unsigned MaxClearance = 0; - unsigned MaxClearanceReg = OriginalReg; - ArrayRef Order = RegClassInfo.getOrder(OpRC); - for (auto Reg : Order) { - assert(AliasMap[Reg].size() == 1 && - "Reg is expected to be mapped to a single index"); - int RCrx = *regIndices(Reg).begin(); - unsigned Clearance = CurInstr - LiveRegs[RCrx].Def; - if (Clearance <= MaxClearance) - continue; - MaxClearance = Clearance; - MaxClearanceReg = Reg; - - if (MaxClearance > Pref) - break; - } - - // Update the operand if we found a register with better clearance. - if (MaxClearanceReg != OriginalReg) - MO.setReg(MaxClearanceReg); - - return false; -} - -/// \brief Return true to if it makes sense to break dependence on a partial def -/// or undef use. -bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, - unsigned Pref) { - unsigned reg = MI->getOperand(OpIdx).getReg(); - for (int rx : regIndices(reg)) { - unsigned Clearance = CurInstr - LiveRegs[rx].Def; - DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); - - if (Pref > Clearance) { - DEBUG(dbgs() << ": Break dependency.\n"); - continue; - } - DEBUG(dbgs() << ": OK .\n"); - return false; - } - return true; -} - -// Update def-ages for registers defined by MI. -// If Kill is set, also kill off DomainValues clobbered by the defs. -// -// Also break dependencies on partial defs and undef uses. -void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency, - bool Kill) { - assert(!MI->isDebugValue() && "Won't process debug values"); - - // Break dependence on undef uses. Do this before updating LiveRegs below. - unsigned OpNum; - if (breakDependency) { - unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); - if (Pref) { - bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); - // We don't need to bother trying to break a dependency if this - // instruction has a true dependency on that register through another - // operand - we'll have to wait for it to be available regardless. - if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); - } - } - const MCInstrDesc &MCID = MI->getDesc(); - for (unsigned i = 0, - e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); - i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - if (MO.isUse()) - continue; - for (int rx : regIndices(MO.getReg())) { - // This instruction explicitly defines rx. - DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr - << '\t' << *MI); - - if (breakDependency) { - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. - unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(*MI, i, TRI); - } - - // How many instructions since rx was last written? - LiveRegs[rx].Def = CurInstr; - - // Kill off domains redefined by generic instructions. - if (Kill) - kill(rx); - } - } - ++CurInstr; -} - -/// \break Break false dependencies on undefined register reads. -/// -/// Walk the block backward computing precise liveness. This is expensive, so we -/// only do it on demand. Note that the occurrence of undefined register reads -/// that should be broken is very rare, but when they occur we may have many in -/// a single block. -void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) { - if (UndefReads.empty()) - return; - - // Collect this block's live out register units. - LiveRegSet.init(*TRI); - // We do not need to care about pristine registers as they are just preserved - // but not actually used in the function. - LiveRegSet.addLiveOutsNoPristines(*MBB); - - MachineInstr *UndefMI = UndefReads.back().first; - unsigned OpIdx = UndefReads.back().second; - - for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { - // Update liveness, including the current instruction's defs. - LiveRegSet.stepBackward(I); - - if (UndefMI == &I) { - if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) - TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); - - UndefReads.pop_back(); - if (UndefReads.empty()) - return; - - UndefMI = UndefReads.back().first; - OpIdx = UndefReads.back().second; - } - } -} - -// A hard instruction only works in one domain. All input registers will be -// forced into that domain. -void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) { - // Collapse all uses. - for (unsigned i = mi->getDesc().getNumDefs(), - e = mi->getDesc().getNumOperands(); i != e; ++i) { - MachineOperand &mo = mi->getOperand(i); - if (!mo.isReg()) continue; - for (int rx : regIndices(mo.getReg())) { - force(rx, domain); - } - } - - // Kill all defs and force them. - for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { - MachineOperand &mo = mi->getOperand(i); - if (!mo.isReg()) continue; - for (int rx : regIndices(mo.getReg())) { - kill(rx); - force(rx, domain); - } - } -} - -// A soft instruction can be changed to work in other domains given by mask. -void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { - // Bitmask of available domains for this instruction after taking collapsed - // operands into account. - unsigned available = mask; - - // Scan the explicit use operands for incoming domains. - SmallVector used; - if (LiveRegs) - for (unsigned i = mi->getDesc().getNumDefs(), - e = mi->getDesc().getNumOperands(); i != e; ++i) { - MachineOperand &mo = mi->getOperand(i); - if (!mo.isReg()) continue; - for (int rx : regIndices(mo.getReg())) { - DomainValue *dv = LiveRegs[rx].Value; - if (dv == nullptr) - continue; - // Bitmask of domains that dv and available have in common. - unsigned common = dv->getCommonDomains(available); - // Is it possible to use this collapsed register for free? - if (dv->isCollapsed()) { - // Restrict available domains to the ones in common with the operand. - // If there are no common domains, we must pay the cross-domain - // penalty for this operand. - if (common) available = common; - } else if (common) - // Open DomainValue is compatible, save it for merging. - used.push_back(rx); - else - // Open DomainValue is not compatible with instruction. It is useless - // now. - kill(rx); - } - } - - // If the collapsed operands force a single domain, propagate the collapse. - if (isPowerOf2_32(available)) { - unsigned domain = countTrailingZeros(available); - TII->setExecutionDomain(*mi, domain); - visitHardInstr(mi, domain); - return; - } - - // Kill off any remaining uses that don't match available, and build a list of - // incoming DomainValues that we want to merge. - SmallVector Regs; - for (int rx : used) { - assert(LiveRegs && "no space allocated for live registers"); - const LiveReg &LR = LiveRegs[rx]; - // This useless DomainValue could have been missed above. - if (!LR.Value->getCommonDomains(available)) { - kill(rx); - continue; - } - // Sorted insertion. - auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR, - [](const LiveReg *LHS, const LiveReg *RHS) { - return LHS->Def < RHS->Def; - }); - Regs.insert(I, &LR); - } - - // doms are now sorted in order of appearance. Try to merge them all, giving - // priority to the latest ones. - DomainValue *dv = nullptr; - while (!Regs.empty()) { - if (!dv) { - dv = Regs.pop_back_val()->Value; - // Force the first dv to match the current instruction. - dv->AvailableDomains = dv->getCommonDomains(available); - assert(dv->AvailableDomains && "Domain should have been filtered"); - continue; - } - - DomainValue *Latest = Regs.pop_back_val()->Value; - // Skip already merged values. - if (Latest == dv || Latest->Next) - continue; - if (merge(dv, Latest)) - continue; - - // If latest didn't merge, it is useless now. Kill all registers using it. - for (int i : used) { - assert(LiveRegs && "no space allocated for live registers"); - if (LiveRegs[i].Value == Latest) - kill(i); - } - } - - // dv is the DomainValue we are going to use for this instruction. - if (!dv) { - dv = alloc(); - dv->AvailableDomains = available; - } - dv->Instrs.push_back(mi); - - // Finally set all defs and non-collapsed uses to dv. We must iterate through - // all the operators, including imp-def ones. - for (MachineInstr::mop_iterator ii = mi->operands_begin(), - ee = mi->operands_end(); - ii != ee; ++ii) { - MachineOperand &mo = *ii; - if (!mo.isReg()) continue; - for (int rx : regIndices(mo.getReg())) { - if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) { - kill(rx); - setLiveReg(rx, dv); - } - } - } -} - -void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB, - bool PrimaryPass) { - enterBasicBlock(MBB); - // If this block is not done, it makes little sense to make any decisions - // based on clearance information. We need to make a second pass anyway, - // and by then we'll have better information, so we can avoid doing the work - // to try and break dependencies now. - bool breakDependency = isBlockDone(MBB); - for (MachineInstr &MI : *MBB) { - if (!MI.isDebugValue()) { - bool Kill = false; - if (PrimaryPass) - Kill = visitInstr(&MI); - processDefs(&MI, breakDependency, Kill); - } - } - if (breakDependency) - processUndefReads(MBB); - leaveBasicBlock(MBB); -} - -bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) { - return MBBInfos[MBB].PrimaryCompleted && - MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && - MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); -} - -bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) { - if (skipFunction(*mf.getFunction())) - return false; - MF = &mf; - TII = MF->getSubtarget().getInstrInfo(); - TRI = MF->getSubtarget().getRegisterInfo(); - RegClassInfo.runOnMachineFunction(mf); - LiveRegs = nullptr; - assert(NumRegs == RC->getNumRegs() && "Bad regclass"); - - DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: " - << TRI->getRegClassName(RC) << " **********\n"); - - // If no relevant registers are used in the function, we can skip it - // completely. - bool anyregs = false; - const MachineRegisterInfo &MRI = mf.getRegInfo(); - for (unsigned Reg : *RC) { - if (MRI.isPhysRegUsed(Reg)) { - anyregs = true; - break; - } - } - if (!anyregs) return false; - - // Initialize the AliasMap on the first use. - if (AliasMap.empty()) { - // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and - // therefore the LiveRegs array. - AliasMap.resize(TRI->getNumRegs()); - for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i) - for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); - AI.isValid(); ++AI) - AliasMap[*AI].push_back(i); - } - - // Initialize the MMBInfos - for (auto &MBB : mf) { - MBBInfo InitialInfo; - MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); - } - - /* - * We want to visit every instruction in every basic block in order to update - * it's execution domain or break any false dependencies. However, for the - * dependency breaking, we need to know clearances from all predecessors - * (including any backedges). One way to do so would be to do two complete - * passes over all basic blocks/instructions, the first for recording - * clearances, the second to break the dependencies. However, for functions - * without backedges, or functions with a lot of straight-line code, and - * a small loop, that would be a lot of unnecessary work (since only the - * BBs that are part of the loop require two passes). As an example, - * consider the following loop. - * - * - * PH -> A -> B (xmm -> xmm) -> C -> D -> EXIT - * ^ | - * +----------------------------------+ - * - * The iteration order is as follows: - * Naive: PH A B C D A' B' C' D' - * Optimized: PH A B C A' B' C' D - * - * Note that we avoid processing D twice, because we can entirely process - * the predecessors before getting to D. We call a block that is ready - * for its second round of processing `done` (isBlockDone). Once we finish - * processing some block, we update the counters in MBBInfos and re-process - * any successors that are now done. - */ - - MachineBasicBlock *Entry = &*MF->begin(); - ReversePostOrderTraversal RPOT(Entry); - SmallVector Workqueue; - for (ReversePostOrderTraversal::rpo_iterator - MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - MachineBasicBlock *MBB = *MBBI; - // N.B: IncomingProcessed and IncomingCompleted were already updated while - // processing this block's predecessors. - MBBInfos[MBB].PrimaryCompleted = true; - MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; - bool Primary = true; - Workqueue.push_back(MBB); - while (!Workqueue.empty()) { - MachineBasicBlock *ActiveMBB = &*Workqueue.back(); - Workqueue.pop_back(); - processBasicBlock(ActiveMBB, Primary); - bool Done = isBlockDone(ActiveMBB); - for (auto *Succ : ActiveMBB->successors()) { - if (!isBlockDone(Succ)) { - if (Primary) { - MBBInfos[Succ].IncomingProcessed++; - } - if (Done) { - MBBInfos[Succ].IncomingCompleted++; - } - if (isBlockDone(Succ)) { - Workqueue.push_back(Succ); - } - } - } - Primary = false; - } - } - - // We need to go through again and finalize any blocks that are not done yet. - // This is possible if blocks have dead predecessors, so we didn't visit them - // above. - for (ReversePostOrderTraversal::rpo_iterator - MBBI = RPOT.begin(), - MBBE = RPOT.end(); - MBBI != MBBE; ++MBBI) { - MachineBasicBlock *MBB = *MBBI; - if (!isBlockDone(MBB)) { - processBasicBlock(MBB, false); - // Don't update successors here. We'll get to them anyway through this - // loop. - } - } - - // Clear the LiveOuts vectors and collapse any remaining DomainValues. - for (ReversePostOrderTraversal::rpo_iterator - MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - auto FI = MBBInfos.find(*MBBI); - if (FI == MBBInfos.end() || !FI->second.OutRegs) - continue; - for (unsigned i = 0, e = NumRegs; i != e; ++i) - if (FI->second.OutRegs[i].Value) - release(FI->second.OutRegs[i].Value); - delete[] FI->second.OutRegs; - } - MBBInfos.clear(); - UndefReads.clear(); - Avail.clear(); - Allocator.DestroyAll(); - - return false; -} +//===- ExecutionDepsFix.cpp - Fix execution dependecy issues ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ExecutionDepsFix.h" + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "execution-deps-fix" + +char ReachingDefAnalysis::ID = 0; +INITIALIZE_PASS(ReachingDefAnalysis, "reaching-deps-analysis", + "ReachingDefAnalysis", false, true) + +char BreakFalseDeps::ID = 0; +INITIALIZE_PASS_BEGIN(BreakFalseDeps, "break-false-deps", "BreakFalseDeps", + false, false) +INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis) +INITIALIZE_PASS_END(BreakFalseDeps, "break-false-deps", "BreakFalseDeps", false, + false) + +/// Translate TRI register number to a list of indices into our smaller tables +/// of interesting registers. +iterator_range::const_iterator> +ExecutionDomainFix::regIndices(unsigned Reg) const { + assert(Reg < AliasMap.size() && "Invalid register"); + const auto &Entry = AliasMap[Reg]; + return make_range(Entry.begin(), Entry.end()); +} + +DomainValue *ExecutionDomainFix::alloc(int domain) { + DomainValue *dv = Avail.empty() ? + new(Allocator.Allocate()) DomainValue : + Avail.pop_back_val(); + if (domain >= 0) + dv->addDomain(domain); + assert(dv->Refs == 0 && "Reference count wasn't cleared"); + assert(!dv->Next && "Chained DomainValue shouldn't have been recycled"); + return dv; +} + +/// Release a reference to DV. When the last reference is released, +/// collapse if needed. +void ExecutionDomainFix::release(DomainValue *DV) { + while (DV) { + assert(DV->Refs && "Bad DomainValue"); + if (--DV->Refs) + return; + + // There are no more DV references. Collapse any contained instructions. + if (DV->AvailableDomains && !DV->isCollapsed()) + collapse(DV, DV->getFirstDomain()); + + DomainValue *Next = DV->Next; + DV->clear(); + Avail.push_back(DV); + // Also release the next DomainValue in the chain. + DV = Next; + } +} + +/// Follow the chain of dead DomainValues until a live DomainValue is reached. +/// Update the referenced pointer when necessary. +DomainValue *ExecutionDomainFix::resolve(DomainValue *&DVRef) { + DomainValue *DV = DVRef; + if (!DV || !DV->Next) + return DV; + + // DV has a chain. Find the end. + do DV = DV->Next; + while (DV->Next); + + // Update DVRef to point to DV. + retain(DV); + release(DVRef); + DVRef = DV; + return DV; +} + +/// Set LiveRegs[rx] = dv, updating reference counts. +void ExecutionDomainFix::setLiveReg(int rx, DomainValue *dv) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + assert(LiveRegs && "Must enter basic block first."); + + if (LiveRegs[rx].Value == dv) + return; + if (LiveRegs[rx].Value) + release(LiveRegs[rx].Value); + LiveRegs[rx].Value = retain(dv); +} + +// Kill register rx, recycle or collapse any DomainValue. +void ExecutionDomainFix::kill(int rx) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + assert(LiveRegs && "Must enter basic block first."); + if (!LiveRegs[rx].Value) + return; + + release(LiveRegs[rx].Value); + LiveRegs[rx].Value = nullptr; +} + +/// Force register rx into domain. +void ExecutionDomainFix::force(int rx, unsigned domain) { + assert(unsigned(rx) < NumRegs && "Invalid index"); + assert(LiveRegs && "Must enter basic block first."); + if (DomainValue *dv = LiveRegs[rx].Value) { + if (dv->isCollapsed()) + dv->addDomain(domain); + else if (dv->hasDomain(domain)) + collapse(dv, domain); + else { + // This is an incompatible open DomainValue. Collapse it to whatever and + // force the new value into domain. This costs a domain crossing. + collapse(dv, dv->getFirstDomain()); + assert(LiveRegs[rx].Value && "Not live after collapse?"); + LiveRegs[rx].Value->addDomain(domain); + } + } else { + // Set up basic collapsed DomainValue. + setLiveReg(rx, alloc(domain)); + } +} + +/// Collapse open DomainValue into given domain. If there are multiple +/// registers using dv, they each get a unique collapsed DomainValue. +void ExecutionDomainFix::collapse(DomainValue *dv, unsigned domain) { + assert(dv->hasDomain(domain) && "Cannot collapse"); + + // Collapse all the instructions. + while (!dv->Instrs.empty()) + TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain); + dv->setSingleDomain(domain); + + // If there are multiple users, give them new, unique DomainValues. + if (LiveRegs && dv->Refs > 1) + for (unsigned rx = 0; rx != NumRegs; ++rx) + if (LiveRegs[rx].Value == dv) + setLiveReg(rx, alloc(domain)); +} + +/// All instructions and registers in B are moved to A, and B is released. +bool ExecutionDomainFix::merge(DomainValue *A, DomainValue *B) { + assert(!A->isCollapsed() && "Cannot merge into collapsed"); + assert(!B->isCollapsed() && "Cannot merge from collapsed"); + if (A == B) + return true; + // Restrict to the domains that A and B have in common. + unsigned common = A->getCommonDomains(B->AvailableDomains); + if (!common) + return false; + A->AvailableDomains = common; + A->Instrs.append(B->Instrs.begin(), B->Instrs.end()); + + // Clear the old DomainValue so we won't try to swizzle instructions twice. + B->clear(); + // All uses of B are referred to A. + B->Next = retain(A); + + for (unsigned rx = 0; rx != NumRegs; ++rx) { + assert(LiveRegs && "no space allocated for live registers"); + if (LiveRegs[rx].Value == B) + setLiveReg(rx, A); + } + return true; +} + +/// Set up LiveRegs by merging predecessor live-out values. +void ReachingDefAnalysis::enterBasicBlock( + const LoopTraversal::TraversedMBBInfo &TraversedMBB) { + + MachineBasicBlock *MBB = TraversedMBB.MBB; + int MBBNumber = MBB->getNumber(); + MBBReachingDefs[MBBNumber].resize(NumRegUnits); + + // Reset instruction counter in each basic block. + CurInstr = 0; + + // Set up LiveRegs to represent registers entering MBB. + if (!LiveRegs) + LiveRegs = new LiveReg[NumRegUnits]; + + // Default values are 'nothing happened a long time ago'. + for (unsigned rx = 0; rx != NumRegUnits; ++rx) { + LiveRegs[rx].Def = -(1 << 20); + } + + // This is the entry block. + if (MBB->pred_empty()) { + for (const auto &LI : MBB->liveins()) { + for (MCRegUnitIterator rx(LI.PhysReg, TRI); rx.isValid(); ++rx) { + // Treat function live-ins as if they were defined just before the first + // instruction. Usually, function arguments are set up immediately + // before the call. + LiveRegs[*rx].Def = -1; + MBBReachingDefs[MBBNumber][*rx].push_back(LiveRegs[*rx].Def); + } + } + DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); + return; + } + + // Try to coalesce live-out registers from predecessors. + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); pi != pe; ++pi) { + auto fi = MBBOutRegsInfos.find(*pi); + assert(fi != MBBOutRegsInfos.end() && + "Should have pre-allocated MBBInfos for all MBBs"); + LiveReg *Incoming = fi->second; + // Incoming is null if this is a backedge from a BB + // we haven't processed yet + if (Incoming == nullptr) { + continue; + } + + for (unsigned rx = 0; rx != NumRegUnits; ++rx) { + // Use the most recent predecessor def for each register. + LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); + if ((LiveRegs[rx].Def != -(1 << 20))) + MBBReachingDefs[MBBNumber][rx].push_back(LiveRegs[rx].Def); + } + } + + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!TraversedMBB.IsDone ? ": incomplete\n" : ": all preds known\n")); +} + +/// Set up LiveRegs by merging predecessor live-out values. +void ExecutionDomainFix::enterBasicBlock( + const LoopTraversal::TraversedMBBInfo &TraversedMBB) { + + MachineBasicBlock *MBB = TraversedMBB.MBB; + + // Set up LiveRegs to represent registers entering MBB. + if (!LiveRegs) + LiveRegs = new LiveReg[NumRegs]; + + // Default values are 'nothing happened a long time ago'. + for (unsigned rx = 0; rx != NumRegs; ++rx) { + LiveRegs[rx].Value = nullptr; + } + + // This is the entry block. + if (MBB->pred_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n"); + return; + } + + // Try to coalesce live-out registers from predecessors. + for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), + pe = MBB->pred_end(); pi != pe; ++pi) { + auto fi = MBBOutRegsInfos.find(*pi); + assert(fi != MBBOutRegsInfos.end() && + "Should have pre-allocated MBBInfos for all MBBs"); + LiveReg *Incoming = fi->second; + // Incoming is null if this is a backedge from a BB + // we haven't processed yet + if (Incoming == nullptr) { + continue; + } + + for (unsigned rx = 0; rx != NumRegs; ++rx) { + DomainValue *pdv = resolve(Incoming[rx].Value); + if (!pdv) + continue; + if (!LiveRegs[rx].Value) { + setLiveReg(rx, pdv); + continue; + } + + // We have a live DomainValue from more than one predecessor. + if (LiveRegs[rx].Value->isCollapsed()) { + // We are already collapsed, but predecessor is not. Force it. + unsigned Domain = LiveRegs[rx].Value->getFirstDomain(); + if (!pdv->isCollapsed() && pdv->hasDomain(Domain)) + collapse(pdv, Domain); + continue; + } + + // Currently open, merge in predecessor. + if (!pdv->isCollapsed()) + merge(LiveRegs[rx].Value, pdv); + else + force(rx, pdv->getFirstDomain()); + } + } + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!TraversedMBB.IsDone ? ": incomplete\n" : ": all preds known\n")); +} + +void ReachingDefAnalysis::leaveBasicBlock( + const LoopTraversal::TraversedMBBInfo &TraversedMBB) { + assert(LiveRegs && "Must enter basic block first."); + // Save register clearances at end of MBB - used by enterBasicBlock(). + MBBOutRegsInfos[TraversedMBB.MBB] = LiveRegs; + + // While processing the basic block, we kept `Def` relative to the start + // of the basic block for convenience. However, future use of this information + // only cares about the clearance from the end of the block, so adjust + // everything to be relative to the end of the basic block. + for (unsigned i = 0, e = NumRegUnits; i != e; ++i) + LiveRegs[i].Def -= CurInstr; + LiveRegs = nullptr; +} + +void ExecutionDomainFix::leaveBasicBlock( + const LoopTraversal::TraversedMBBInfo &TraversedMBB) { + assert(LiveRegs && "Must enter basic block first."); + LiveReg *OldOutRegs = MBBOutRegsInfos[TraversedMBB.MBB]; + // Save register clearances at end of MBB - used by enterBasicBlock(). + MBBOutRegsInfos[TraversedMBB.MBB] = LiveRegs; + if (OldOutRegs) { + // This must be the second pass. + // Release all the DomainValues instead of keeping them. + for (unsigned i = 0, e = NumRegs; i != e; ++i) + release(OldOutRegs[i].Value); + delete[] OldOutRegs; + } + LiveRegs = nullptr; +} + +bool ExecutionDomainFix::visitInstr(MachineInstr *MI) { + // Update instructions with explicit execution domains. + std::pair DomP = TII->getExecutionDomain(*MI); + if (DomP.first) { + if (DomP.second) + visitSoftInstr(MI, DomP.second); + else + visitHardInstr(MI, DomP.first); + } + + return !DomP.first; +} + +/// \brief Helps avoid false dependencies on undef registers by updating the +/// machine instructions' undef operand to use a register that the instruction +/// is truly dependent on, or use a register with clearance higher than Pref. +/// Returns true if it was able to find a true dependency, thus not requiring +/// a dependency breaking instruction regardless of clearance. +bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, + unsigned OpIdx, unsigned Pref) { + MachineOperand &MO = MI->getOperand(OpIdx); + assert(MO.isUndef() && "Expected undef machine operand"); + + unsigned OriginalReg = MO.getReg(); + + // Update only undef operands that have reg units that are mapped to one root. + for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) { + unsigned NumRoots = 0; + for (MCRegUnitRootIterator Root(*Unit, TRI); Root.isValid(); ++Root) { + NumRoots++; + if (NumRoots > 1) + return false; + } + } + + // Get the undef operand's register class + const TargetRegisterClass *OpRC = + TII->getRegClass(MI->getDesc(), OpIdx, TRI, *MF); + + // If the instruction has a true dependency, we can hide the false depdency + // behind it. + for (MachineOperand &CurrMO : MI->operands()) { + if (!CurrMO.isReg() || CurrMO.isDef() || CurrMO.isUndef() || + !OpRC->contains(CurrMO.getReg())) + continue; + // We found a true dependency - replace the undef register with the true + // dependency. + MO.setReg(CurrMO.getReg()); + return true; + } + + // Go over all registers in the register class and find the register with + // max clearance or clearance higher than Pref. + unsigned MaxClearance = 0; + unsigned MaxClearanceReg = OriginalReg; + ArrayRef Order = RegClassInfo.getOrder(OpRC); + for (auto Reg : Order) { + unsigned Clearance = RDA->getClearance(MI, Reg); + if (Clearance <= MaxClearance) + continue; + MaxClearance = Clearance; + MaxClearanceReg = Reg; + + if (MaxClearance > Pref) + break; + } + + // Update the operand if we found a register with better clearance. + if (MaxClearanceReg != OriginalReg) + MO.setReg(MaxClearanceReg); + + return false; +} + +/// \brief Return true to if it makes sense to break dependence on a partial def +/// or undef use. +bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx, + unsigned Pref) { + unsigned reg = MI->getOperand(OpIdx).getReg(); + unsigned Clearance = RDA->getClearance(MI, reg); + DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref); + + if (Pref > Clearance) { + DEBUG(dbgs() << ": Break dependency.\n"); + return true; + } + DEBUG(dbgs() << ": OK .\n"); + return false; +} + +// Update def-ages for registers defined by MI. +// If Kill is set, also kill off DomainValues clobbered by the defs. +// +// Also break dependencies on partial defs and undef uses. +void ExecutionDomainFix::processDefs(MachineInstr *MI, bool Kill) { + assert(!MI->isDebugValue() && "Won't process debug values"); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, + e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); + i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) + continue; + if (MO.isUse()) + continue; + for (int rx : regIndices(MO.getReg())) { + // This instruction explicitly defines rx. + DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << *MI); + + // Kill off domains redefined by generic instructions. + if (Kill) + kill(rx); + } + } +} + +// Update def-ages for registers defined by MI. +// Also break dependencies on partial defs and undef uses. +void ReachingDefAnalysis::processDefs(MachineInstr *MI) { + assert(!MI->isDebugValue() && "Won't process debug values"); + + int MBBNumber = MI->getParent()->getNumber(); + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, + e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); + i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isUse()) + continue; + for (MCRegUnitIterator rx(MO.getReg(), TRI); rx.isValid(); ++rx) { + // This instruction explicitly defines rx. + DEBUG(dbgs() << TRI->getName(MO.getReg()) << ":\t" << CurInstr << '\t' + << *MI); + + // How many instructions since this reg unit was last written? + LiveRegs[*rx].Def = CurInstr; + MBBReachingDefs[MBBNumber][*rx].push_back(CurInstr); + } + } + InstIds[MI] = CurInstr; + ++CurInstr; +} + +// Update def-ages for registers defined by MI. +// Also break dependencies on partial defs and undef uses. +void BreakFalseDeps::processDefs(MachineInstr *MI) { + assert(!MI->isDebugValue() && "Won't process debug values"); + + // Break dependence on undef uses. Do this before updating LiveRegs below. + unsigned OpNum; + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { + bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref); + // We don't need to bother trying to break a dependency if this + // instruction has a true dependency on that register through another + // operand - we'll have to wait for it to be available regardless. + if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } + + const MCInstrDesc &MCID = MI->getDesc(); + for (unsigned i = 0, + e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs(); + i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg() || !MO.getReg()) + continue; + if (MO.isUse()) + continue; + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(*MI, i, TRI); + } +} + +/// \break Break false dependencies on undefined register reads. +/// +/// Walk the block backward computing precise liveness. This is expensive, so we +/// only do it on demand. Note that the occurrence of undefined register reads +/// that should be broken is very rare, but when they occur we may have many in +/// a single block. +void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) { + if (UndefReads.empty()) + return; + + // Collect this block's live out register units. + LiveRegSet.init(*TRI); + // We do not need to care about pristine registers as they are just preserved + // but not actually used in the function. + LiveRegSet.addLiveOutsNoPristines(*MBB); + + MachineInstr *UndefMI = UndefReads.back().first; + unsigned OpIdx = UndefReads.back().second; + + for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) { + // Update liveness, including the current instruction's defs. + LiveRegSet.stepBackward(I); + + if (UndefMI == &I) { + if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg())) + TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI); + + UndefReads.pop_back(); + if (UndefReads.empty()) + return; + + UndefMI = UndefReads.back().first; + OpIdx = UndefReads.back().second; + } + } +} + +// A hard instruction only works in one domain. All input registers will be +// forced into that domain. +void ExecutionDomainFix::visitHardInstr(MachineInstr *mi, unsigned domain) { + // Collapse all uses. + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + for (int rx : regIndices(mo.getReg())) { + force(rx, domain); + } + } + + // Kill all defs and force them. + for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + for (int rx : regIndices(mo.getReg())) { + kill(rx); + force(rx, domain); + } + } +} + +// A soft instruction can be changed to work in other domains given by mask. +void ExecutionDomainFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { + // Bitmask of available domains for this instruction after taking collapsed + // operands into account. + unsigned available = mask; + + // Scan the explicit use operands for incoming domains. + SmallVector used; + if (LiveRegs) + for (unsigned i = mi->getDesc().getNumDefs(), + e = mi->getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &mo = mi->getOperand(i); + if (!mo.isReg()) continue; + for (int rx : regIndices(mo.getReg())) { + DomainValue *dv = LiveRegs[rx].Value; + if (dv == nullptr) + continue; + // Bitmask of domains that dv and available have in common. + unsigned common = dv->getCommonDomains(available); + // Is it possible to use this collapsed register for free? + if (dv->isCollapsed()) { + // Restrict available domains to the ones in common with the operand. + // If there are no common domains, we must pay the cross-domain + // penalty for this operand. + if (common) available = common; + } else if (common) + // Open DomainValue is compatible, save it for merging. + used.push_back(rx); + else + // Open DomainValue is not compatible with instruction. It is useless + // now. + kill(rx); + } + } + + // If the collapsed operands force a single domain, propagate the collapse. + if (isPowerOf2_32(available)) { + unsigned domain = countTrailingZeros(available); + TII->setExecutionDomain(*mi, domain); + visitHardInstr(mi, domain); + return; + } + + // Kill off any remaining uses that don't match available, and build a list of + // incoming DomainValues that we want to merge. + SmallVector Regs; + for (int rx : used) { + assert(LiveRegs && "no space allocated for live registers"); + LiveReg &LR = LiveRegs[rx]; + LR.Def = RDA->getReachingDef(mi, RC->getRegister(rx)); + // This useless DomainValue could have been missed above. + if (!LR.Value->getCommonDomains(available)) { + kill(rx); + continue; + } + // Sorted insertion. + auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR, + [](const LiveReg *LHS, const LiveReg *RHS) { + return LHS->Def < RHS->Def; + }); + Regs.insert(I, &LR); + } + + // doms are now sorted in order of appearance. Try to merge them all, giving + // priority to the latest ones. + DomainValue *dv = nullptr; + while (!Regs.empty()) { + if (!dv) { + dv = Regs.pop_back_val()->Value; + // Force the first dv to match the current instruction. + dv->AvailableDomains = dv->getCommonDomains(available); + assert(dv->AvailableDomains && "Domain should have been filtered"); + continue; + } + + DomainValue *Latest = Regs.pop_back_val()->Value; + // Skip already merged values. + if (Latest == dv || Latest->Next) + continue; + if (merge(dv, Latest)) + continue; + + // If latest didn't merge, it is useless now. Kill all registers using it. + for (int i : used) { + assert(LiveRegs && "no space allocated for live registers"); + if (LiveRegs[i].Value == Latest) + kill(i); + } + } + + // dv is the DomainValue we are going to use for this instruction. + if (!dv) { + dv = alloc(); + dv->AvailableDomains = available; + } + dv->Instrs.push_back(mi); + + // Finally set all defs and non-collapsed uses to dv. We must iterate through + // all the operators, including imp-def ones. + for (MachineInstr::mop_iterator ii = mi->operands_begin(), + ee = mi->operands_end(); + ii != ee; ++ii) { + MachineOperand &mo = *ii; + if (!mo.isReg()) continue; + for (int rx : regIndices(mo.getReg())) { + if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) { + kill(rx); + setLiveReg(rx, dv); + } + } + } +} + +void ExecutionDomainFix::processBasicBlock( + const LoopTraversal::TraversedMBBInfo &TraversedMBB) { + enterBasicBlock(TraversedMBB); + // If this block is not done, it makes little sense to make any decisions + // based on clearance information. We need to make a second pass anyway, + // and by then we'll have better information, so we can avoid doing the work + // to try and break dependencies now. + for (MachineInstr &MI : *TraversedMBB.MBB) { + if (!MI.isDebugValue()) { + bool Kill = false; + if (TraversedMBB.PrimaryPass) + Kill = visitInstr(&MI); + processDefs(&MI, Kill); + } + } + leaveBasicBlock(TraversedMBB); +} + +void ReachingDefAnalysis::processBasicBlock( + const LoopTraversal::TraversedMBBInfo &TraversedMBB) { + enterBasicBlock(TraversedMBB); + for (MachineInstr &MI : *TraversedMBB.MBB) { + if (!MI.isDebugValue()) + processDefs(&MI); + } + leaveBasicBlock(TraversedMBB); +} + +void BreakFalseDeps::processBasicBlock(MachineBasicBlock* MBB) { + UndefReads.clear(); + // If this block is not done, it makes little sense to make any decisions + // based on clearance information. We need to make a second pass anyway, + // and by then we'll have better information, so we can avoid doing the work + // to try and break dependencies now. + for (MachineInstr &MI : *MBB) { + if (!MI.isDebugValue()) { + processDefs(&MI); + } + } + processUndefReads(MBB); +} + +bool LoopTraversal::isBlockDone(MachineBasicBlock *MBB) { + return MBBInfos[MBB].PrimaryCompleted && + MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && + MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); +} + +const SmallVector +LoopTraversal::traverse(MachineFunction &MF) { + // Initialize the MMBInfos + for (auto &MBB : MF) { + MBBInfo InitialInfo; + MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); + } + + /* + * We want to visit every instruction in every basic block in order to update + * it's execution domain or break any false dependencies. However, for the + * dependency breaking, we need to know clearances from all predecessors + * (including any backedges). One way to do so would be to do two complete + * passes over all basic blocks/instructions, the first for recording + * clearances, the second to break the dependencies. However, for functions + * without backedges, or functions with a lot of straight-line code, and + * a small loop, that would be a lot of unnecessary work (since only the + * BBs that are part of the loop require two passes). As an example, + * consider the following loop. + * + * + * PH -> A -> B (xmm -> xmm) -> C -> D -> EXIT + * ^ | + * +----------------------------------+ + * + * The iteration order is as follows: + * Naive: PH A B C D A' B' C' D' + * Optimized: PH A B C A' B' C' D + * + * Note that we avoid processing D twice, because we can entirely process + * the predecessors before getting to D. We call a block that is ready + * for its second round of processing `done` (isBlockDone). Once we finish + * processing some block, we update the counters in MBBInfos and re-process + * any successors that are now done. + */ + + MachineBasicBlock *Entry = &*MF.begin(); + ReversePostOrderTraversal RPOT(Entry); + SmallVector Workqueue; + SmallVector MBBTraversalOrder; + for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + // N.B: IncomingProcessed and IncomingCompleted were already updated while + // processing this block's predecessors. + MBBInfos[MBB].PrimaryCompleted = true; + MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; + bool Primary = true; + Workqueue.push_back(MBB); + while (!Workqueue.empty()) { + MachineBasicBlock *ActiveMBB = &*Workqueue.back(); + Workqueue.pop_back(); + bool Done = isBlockDone(ActiveMBB); + MBBTraversalOrder.push_back(TraversedMBBInfo(ActiveMBB, Primary, Done)); + for (auto *Succ : ActiveMBB->successors()) { + if (!isBlockDone(Succ)) { + if (Primary) { + MBBInfos[Succ].IncomingProcessed++; + } + if (Done) { + MBBInfos[Succ].IncomingCompleted++; + } + if (isBlockDone(Succ)) { + Workqueue.push_back(Succ); + } + } + } + Primary = false; + } + } + + // We need to go through again and finalize any blocks that are not done yet. + // This is possible if blocks have dead predecessors, so we didn't visit them + // above. + for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + if (!isBlockDone(MBB)) { + MBBTraversalOrder.push_back(TraversedMBBInfo(MBB, false, true)); + // Don't update successors here. We'll get to them anyway through this + // loop. + } + } + + MBBInfos.clear(); + + return MBBTraversalOrder; +} + +bool ExecutionDomainFix::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + MF = &mf; + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + LiveRegs = nullptr; + assert(NumRegs == RC->getNumRegs() && "Bad regclass"); + + RDA = &getAnalysis(); + + DEBUG(dbgs() << "********** FIX EXECUTION DOMAIN: " + << TRI->getRegClassName(RC) << " **********\n"); + + // If no relevant registers are used in the function, we can skip it + // completely. + bool anyregs = false; + const MachineRegisterInfo &MRI = mf.getRegInfo(); + for (unsigned Reg : *RC) { + if (MRI.isPhysRegUsed(Reg)) { + anyregs = true; + break; + } + } + if (!anyregs) return false; + + // Initialize the AliasMap on the first use. + if (AliasMap.empty()) { + // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and + // therefore the LiveRegs array. + AliasMap.resize(TRI->getNumRegs()); + for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i) + for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true); + AI.isValid(); ++AI) + AliasMap[*AI].push_back(i); + } + + // Initialize the MBBOutRegsInfos + for (auto &MBB : mf) { + MBBOutRegsInfos.insert(std::make_pair(&MBB, nullptr)); + } + + // Traverse the basic blocks. + LoopTraversal Traversal; + const SmallVectorImpl + &TraversedMBBInfoOrder = Traversal.traverse(mf); + for (auto TraversedMBB : TraversedMBBInfoOrder) { + processBasicBlock(TraversedMBB); + } + + for (auto MBBOutRegs : MBBOutRegsInfos) { + if (!MBBOutRegs.second) + continue; + for (unsigned i = 0, e = NumRegs; i != e; ++i) + if (MBBOutRegs.second[i].Value) + release(MBBOutRegs.second[i].Value); + delete[] MBBOutRegs.second; + } + MBBOutRegsInfos.clear(); + Avail.clear(); + Allocator.DestroyAll(); + + return false; +} +bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + MF = &mf; + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + + LiveRegs = nullptr; + NumRegUnits = TRI->getNumRegUnits(); + + MBBReachingDefs.resize(mf.getNumBlockIDs()); + + DEBUG(dbgs() << "********** REACHING DEFINITION ANALYSIS **********\n"); + + // Initialize the MBBOutRegsInfos + for (auto &MBB : mf) { + MBBOutRegsInfos.insert(std::make_pair(&MBB, nullptr)); + } + + // Traverse the basic blocks. + LoopTraversal Traversal; + const SmallVectorImpl + &TraversedMBBInfoOrder = Traversal.traverse(mf); + for (auto TraversedMBB : TraversedMBBInfoOrder) { + processBasicBlock(TraversedMBB); + } + + // Sorting all reaching defs found for a ceartin reg unit in a given BB. + for (MBBDefsInfo &MBBDefs : MBBReachingDefs) { + for (MBBRegUnitDefs &RegUnitDefs : MBBDefs) + std::sort(RegUnitDefs.begin(), RegUnitDefs.end()); + } + + return false; +} + +void ReachingDefAnalysis::releaseMemory() { + // Clear the LiveOuts vectors and collapse any remaining DomainValues. + for (auto MBBOutRegs : MBBOutRegsInfos) { + if (!MBBOutRegs.second) + continue; + delete[] MBBOutRegs.second; + } + MBBOutRegsInfos.clear(); + MBBReachingDefs.clear(); + InstIds.clear(); +} + +bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) { + if (skipFunction(*mf.getFunction())) + return false; + MF = &mf; + TII = MF->getSubtarget().getInstrInfo(); + TRI = MF->getSubtarget().getRegisterInfo(); + RDA = &getAnalysis(); + + RegClassInfo.runOnMachineFunction(mf); + + DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n"); + + // Traverse the basic blocks. + for (MachineBasicBlock &MBB : mf) { + processBasicBlock(&MBB); + } + + return false; +} + +int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) { + int InstId = InstIds[MI]; + int DefRes = -(1 << 20); + int MBBNumber = MI->getParent()->getNumber(); + int LatestDef = -(1 << 20); + for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) { + for (int Def : MBBReachingDefs[MBBNumber][*Unit]) { + if (Def >= InstId) + break; + DefRes = Def; + } + LatestDef = std::max(LatestDef, DefRes); + } + return LatestDef; +} + +int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) { + return InstIds[MI] - getReachingDef(MI, PhysReg); +} Index: lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- lib/Target/ARM/ARMTargetMachine.cpp +++ lib/Target/ARM/ARMTargetMachine.cpp @@ -75,7 +75,7 @@ cl::desc("Enable the global merge pass")); namespace llvm { - void initializeARMExecutionDepsFixPass(PassRegistry&); + void initializeARMExecutionDomainFixPass(PassRegistry&); } extern "C" void LLVMInitializeARMTarget() { @@ -90,7 +90,7 @@ initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMConstantIslandsPass(Registry); - initializeARMExecutionDepsFixPass(Registry); + initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); } @@ -355,20 +355,23 @@ void addPreEmitPass() override; }; -class ARMExecutionDepsFix : public ExecutionDepsFix { -public: - static char ID; - ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {} - StringRef getPassName() const override { - return "ARM Execution Dependency Fix"; - } -}; -char ARMExecutionDepsFix::ID; - -} // end anonymous namespace - -INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix", - "ARM Execution Dependency Fix", false, false) +class ARMExecutionDomainFix : public ExecutionDomainFix { +public: + static char ID; + ARMExecutionDomainFix() : ExecutionDomainFix(ID, ARM::DPRRegClass) {} + StringRef getPassName() const override { + return "ARM Execution Domain Fix"; + } +}; +char ARMExecutionDomainFix::ID; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(ARMExecutionDomainFix, "arm-execution-domain-fix", + "ARM Execution Domain Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis) +INITIALIZE_PASS_END(ARMExecutionDomainFix, "arm-execution-domain-fix", + "ARM Execution Domain Fix", false, false) TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) { return new ARMPassConfig(*this, PM); @@ -462,7 +465,8 @@ if (EnableARMLoadStoreOpt) addPass(createARMLoadStoreOptimizationPass()); - addPass(new ARMExecutionDepsFix()); + addPass(new ARMExecutionDomainFix()); + addPass(new BreakFalseDeps()); } // Expand some pseudo instructions into multiple instructions to allow Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -20586,7 +20586,7 @@ SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. - // TODO: use undef instead and let ExecutionDepsFix deal with it? + // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain}; @@ -20614,7 +20614,7 @@ SDValue Segment = DAG.getRegister(0, MVT::i32); // If source is undef or we know it won't be used, use a zero vector // to break register dependency. - // TODO: use undef instead and let ExecutionDepsFix deal with it? + // TODO: use undef instead and let BreakFalseDeps deal with it? if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode())) Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}; Index: lib/Target/X86/X86InstrAVX512.td =================================================================== --- lib/Target/X86/X86InstrAVX512.td +++ lib/Target/X86/X86InstrAVX512.td @@ -432,7 +432,7 @@ // Alias instruction that maps zero vector to pxor / xorp* for AVX-512. // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then -// swizzled by ExecutionDepsFix to pxor. +// swizzled by ExecutionDomainFix to pxor. // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -8013,7 +8013,7 @@ return false; } -/// Inform the ExecutionDepsFix pass how many idle +/// Inform the BreakFalseDeps pass how many idle /// instructions we would like before a partial register update. unsigned X86InstrInfo::getPartialRegUpdateClearance( const MachineInstr &MI, unsigned OpNum, @@ -8166,7 +8166,7 @@ return false; } -/// Inform the ExecutionDepsFix pass how many idle instructions we would like +/// Inform the BreakFalseDeps pass how many idle instructions we would like /// before certain undef register reads. /// /// This catches the VCVTSI2SD family of instructions: Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -336,7 +336,7 @@ // Alias instruction that maps zero vector to pxor / xorp* for sse. // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then -// swizzled by ExecutionDepsFix to pxor. +// swizzled by ExecutionDomainFix to pxor. // We set canFoldAsLoad because this can be converted to a constant-pool // load of an all-zeros value if folding it would be beneficial. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -3122,7 +3122,7 @@ // which has a clobber before the rcp, vs. // vrcpss mem, %xmm0, %xmm0 // TODO: In theory, we could fold the load, and avoid the stall caused by - // the partial register store, either in ExecutionDepsFix or with smarter RA. + // the partial register store, either in BreakFalseDeps or with smarter RA. let Predicates = [target] in { def : Pat<(OpNode RC:$src), (!cast("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; Index: lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- lib/Target/X86/X86RegisterInfo.cpp +++ lib/Target/X86/X86RegisterInfo.cpp @@ -80,7 +80,7 @@ bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { - // ExecutionDepsFixer and PostRAScheduler require liveness. + // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness. return true; } Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -60,7 +60,7 @@ void initializeFixupLEAPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); -void initializeX86ExecutionDepsFixPass(PassRegistry &); +void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); } // end namespace llvm @@ -78,7 +78,7 @@ initializeFixupLEAPassPass(PR); initializeX86CallFrameOptimizationPass(PR); initializeX86CmovConverterPassPass(PR); - initializeX86ExecutionDepsFixPass(PR); + initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); } @@ -325,20 +325,23 @@ void addPreSched2() override; }; -class X86ExecutionDepsFix : public ExecutionDepsFix { -public: - static char ID; - X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {} - StringRef getPassName() const override { - return "X86 Execution Dependency Fix"; - } -}; -char X86ExecutionDepsFix::ID; - -} // end anonymous namespace - -INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix", - "X86 Execution Dependency Fix", false, false) +class X86ExecutionDomainFix : public ExecutionDomainFix { +public: + static char ID; + X86ExecutionDomainFix() : ExecutionDomainFix(ID, X86::VR128XRegClass) {} + StringRef getPassName() const override { + return "X86 Execution Dependency Fix"; + } +}; +char X86ExecutionDomainFix::ID; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(X86ExecutionDomainFix, "x86-execution-domain-fix", + "X86 Execution Domain Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(ReachingDefAnalysis) +INITIALIZE_PASS_END(X86ExecutionDomainFix, "x86-execution-domain-fix", + "X86 Execution Domain Fix", false, false) TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(*this, PM); @@ -424,8 +427,10 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); } void X86PassConfig::addPreEmitPass() { - if (getOptLevel() != CodeGenOpt::None) - addPass(new X86ExecutionDepsFix()); + if (getOptLevel() != CodeGenOpt::None) { + addPass(new X86ExecutionDomainFix()); + addPass(new BreakFalseDeps()); + } if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); Index: test/CodeGen/ARM/deps-fix.ll =================================================================== --- test/CodeGen/ARM/deps-fix.ll +++ test/CodeGen/ARM/deps-fix.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard -mtriple armv7-linux-gnueabi | FileCheck %s -;; This test checks that the ExecutionDepsFix pass performs the domain changes +;; This test checks that the ExecutionDomainFix pass performs the domain changes ;; even when some dependencies are propagated through implicit definitions. ; CHECK: fun_a Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -67,7 +67,7 @@ ; SSE: for.body{{$}} ; ; This loop contains two cvtsi2ss instructions that update the same xmm -; register. Verify that the execution dependency fix pass breaks those +; register. Verify that the break false dependency fix pass breaks those ; dependencies by inserting xorps instructions. ; ; If the register allocator chooses different registers for the two cvtsi2ss @@ -141,7 +141,7 @@ ; This loop contains a cvtsi2sd instruction that has a loop-carried ; false dependency on an xmm that is modified by other scalar instructions ; that follow it in the loop. Additionally, the source of convert is a -; memory operand. Verify the execution dependency fix pass breaks this +; memory operand. Verify the break false dependency fix pass breaks this ; dependency by inserting a xor before the convert. @x = common global [1024 x double] zeroinitializer, align 16 @y = common global [1024 x double] zeroinitializer, align 16