Index: lib/CodeGen/TwoAddressInstructionPass.cpp =================================================================== --- lib/CodeGen/TwoAddressInstructionPass.cpp +++ lib/CodeGen/TwoAddressInstructionPass.cpp @@ -37,6 +37,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" @@ -68,6 +69,13 @@ cl::desc("Coalesce copies by rescheduling (default=true)"), cl::init(true), cl::Hidden); +// Limit the number of dataflow edges to traverse when evaluating the benefit +// of commuting operands. +static cl::opt MaxDataFlowEdge( + "dataflow-edge-limit", cl::Hidden, cl::init(3), + cl::desc("Maximum number of dataflow edges to traverse when evaluating " + "the benefit of commuting operands")); + namespace { class TwoAddressInstructionPass : public MachineFunctionPass { MachineFunction *MF; @@ -77,6 +85,7 @@ MachineRegisterInfo *MRI; LiveVariables *LV; LiveIntervals *LIS; + MachineLoopInfo *MLI; AliasAnalysis *AA; CodeGenOpt::Level OptLevel; @@ -87,7 +96,8 @@ DenseMap DistanceMap; // Set of already processed instructions in the current block. - SmallPtrSet Processed; + using MachineInstrSet = SmallPtrSet; + MachineInstrSet Processed; // A map from virtual registers to physical registers which are likely targets // to be coalesced to due to copies from physical registers to virtual @@ -106,6 +116,18 @@ bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef); + MachineInstr* findReachingDefInMBB(MachineOperand *UseMO); + void collectReachableLiveIns(MachineInstr *MI, unsigned UseReg, + SmallDenseSet &LiveIns, + unsigned MaxLen); + + void collectUsesInMBB(MachineOperand *DefMO, MachineInstrSet &Uses); + bool isReachableInMBB(unsigned Reg, MachineInstr *From, MachineInstr *To, + unsigned MaxLen); + + bool isRecurrenceChain(MachineInstr *MI, unsigned SrcReg, unsigned DstReg, + int MaxLen); + bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, unsigned Dist); @@ -162,6 +184,7 @@ AU.addPreserved(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -539,7 +562,7 @@ return TRI->regsOverlap(RegA, RegB); } -// Returns true if Reg is equal or aliased to at least one register in Set. +/// Returns true if Reg is equal or aliased to at least one register in Set. static bool regOverlapsSet(const SmallVectorImpl &Set, unsigned Reg, const TargetRegisterInfo *TRI) { for (unsigned R : Set) @@ -549,6 +572,169 @@ return false; } +/// Find a reaching definition for UseMO if there's one in MBB +MachineInstr* TwoAddressInstructionPass::findReachingDefInMBB( + MachineOperand *UseMO) { + assert(UseMO->isUse() && UseMO->isReg()); + auto Inst = UseMO->getParent(); + auto Reg = UseMO->getReg(); + + // Find the last definition of Reg before Inst. + MachineBasicBlock::reverse_iterator MI(Inst); + MI = std::next(MI); + for ( ; MI != MBB->rend() ; ++MI) + if ((*MI).findRegisterDefOperandIdx(Reg) != -1) + return &*MI; + return nullptr; +} + +/// Collect live-ins which are backward-reachable from UseReg. MaxLen limits +/// the maximum number of dataflow edges. +void TwoAddressInstructionPass::collectReachableLiveIns( + MachineInstr *MI, unsigned UseReg, SmallDenseSet &LiveIns, + unsigned MaxLen) { + MachineOperand *UseMO = MI->findRegisterUseOperand(UseReg); + if (!UseMO) + return; + + SmallVector, 4> WorkList; + WorkList.push_back(std::make_pair(UseMO, 0)); + + while (!WorkList.empty()) { + MachineOperand *MO; + unsigned Dist; + std::tie(MO, Dist) = WorkList.back(); + WorkList.pop_back(); + + MachineInstr *DefMI = findReachingDefInMBB(MO); + if (!DefMI) { + // There's no local reaching definition for MO, which means that MO is a + // live-in. + LiveIns.insert(MO->getReg()); + continue; + } + + // Give up if Dist is greater than MaxLen. + if (Dist > MaxLen) + continue; + + auto OpsNum = DefMI->getDesc().getNumOperands(); + auto SrcOpIdx = DefMI->getDesc().getNumDefs(); + for ( ; SrcOpIdx < OpsNum ; ++SrcOpIdx) { + auto &SrcMO = DefMI->getOperand(SrcOpIdx); + if (SrcMO.isReg()) + WorkList.push_back(std::make_pair(&SrcMO, Dist+1)); + } + } +} + +/// Collect instructions that uses DefMO inside MBB. +void TwoAddressInstructionPass::collectUsesInMBB(MachineOperand *DefMO, + MachineInstrSet &Uses) { + assert(DefMO->isDef() && DefMO->isReg()); + auto Inst = DefMO->getParent(); + auto Reg = DefMO->getReg(); + + for (auto &Use : MRI->use_instructions(Reg)) { + if (Use.isDebugValue() || Use.getParent() != MBB) + continue; + auto *UseMO = Use.findRegisterUseOperand(Reg); + if (Inst == findReachingDefInMBB(UseMO)) + Uses.insert(&Use); + } +} + +/// Check if there is a dataflow from Reg, which is defined by From, to To +/// inside MBB. MaxLen limits the maximum number of dataflow edges. +bool TwoAddressInstructionPass::isReachableInMBB( + unsigned Reg, MachineInstr *From, MachineInstr *To, unsigned MaxLen) { + MachineOperand *MO = From->findRegisterDefOperand(Reg); + assert(MO && "Reg is not defined by From"); + + SmallVector, 4> WorkList; + WorkList.push_back(std::make_pair(MO, 0)); + + while (!WorkList.empty()) { + unsigned Dist; + std::tie(MO, Dist) = WorkList.back(); + WorkList.pop_back(); + + MachineInstrSet UseMIs; + collectUsesInMBB(MO, UseMIs); + + for (auto *UseMI : UseMIs) { + if (UseMI == To) + return true; + + // Give up if Dist is greater than MaxLen. + if (Dist > MaxLen) + continue; + + for (auto &Def : UseMI->defs()) + if (Def.isReg()) + WorkList.push_back(std::make_pair(&Def, Dist+1)); + } + } + + return false; +} + +/// Return true if DstReg and SrcReg are the part of a recurrence chain, which +/// ends with a copy instruction. +bool TwoAddressInstructionPass::isRecurrenceChain( + MachineInstr *MI, unsigned SrcReg, unsigned DstReg, int MaxLen) { + // Collect backward-reachable live-ins from SrcReg. + SmallDenseSet ReachableLiveIns; + collectReachableLiveIns(MI, SrcReg, ReachableLiveIns, MaxLen); + if (ReachableLiveIns.empty()) + return false; + + // For each reachable live-ins, check following: + // - The live-in has a unique def UD. + // - UD is a copy. + // - UD is in the header of the loop that MBB belongs to. + // - The source operand of UD comes from another copy, which is in MBB. + // - There is a dataflow from DstReg to the source operand of UD. + for (auto LiveIn : ReachableLiveIns) { + // Check if there's a unique def of LiveIn, which is a copy. + MachineInstr *UD = MRI->getUniqueVRegDef(LiveIn); + if (!UD || !UD->isCopy()) + continue; + + // Check if the definition is in the loop header of the loop for MBB. + MachineBasicBlock *UDBB = UD->getParent(); + MachineLoop *Loop = MLI->getLoopFor(MBB); + if (!Loop || Loop != MLI->getLoopFor(UDBB) ||!MLI->isLoopHeader(UDBB)) + continue; + + // Check if the only definition of the source operand of Def instruction + // inside Loop is in MBB. + MachineInstr *UDSrcDef = nullptr; + for (auto &Def: MRI->def_instructions(UD->getOperand(1).getReg())) { + if (MLI->getLoopFor(Def.getParent()) != Loop) + continue; + + // If UDSrcDef is not nullptr, there are multiple definitions for the + // source operand of UD inside Loop. + if (UDSrcDef) { + UDSrcDef = nullptr; + break; + } + UDSrcDef = &Def; + } + + // Check if UDSrcDef is a copy, and is in MBB. + if (!UDSrcDef || !UDSrcDef->isCopy() || UDSrcDef->getParent() != MBB) + continue; + + // Check if there is a dataflow from DstReg to UDSrcDef + if (isReachableInMBB(DstReg, MI, UDSrcDef, MaxLen)) + return true; + } + + return false; +} + /// Return true if it's potentially profitable to commute the two-address /// instruction that's being processed. bool @@ -637,12 +823,29 @@ // To more generally minimize register copies, ideally the logic of two addr // instruction pass should be integrated with register allocation pass where // interference graph is available. - if (isRevCopyChain(regC, regA, 3)) + if (isRevCopyChain(regC, regA, MaxDataFlowEdge)) return true; - if (isRevCopyChain(regB, regA, 3)) + if (isRevCopyChain(regB, regA, MaxDataFlowEdge)) return false; + // Check if there is a recurrence that a redundant copy can be avoided by + // commuting operands. For example, if we have a recurrence of + // Header: + // %vreg1 = MOV %vreg0 + // Latch: + // %vreg3 = ADD %vreg2, %vreg1 + // %vreg0 = MOV %vreg3 + // swithching %vreg2 and %vreg1 avoids a redundant copy at the end by feeding + // the accumulation result directly back to the header. + if (MLI) { + if (isRecurrenceChain(MI, regC, regA, MaxDataFlowEdge)) + return true; + + if (isRecurrenceChain(MI, regB, regA, MaxDataFlowEdge)) + return false; + } + // Since there are no intervening uses for both registers, then commute // if the def of regC is closer. Its live interval is shorter. return LastDefB && LastDefC && LastDefC > LastDefB; @@ -1627,6 +1830,7 @@ InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable(); LIS = getAnalysisIfAvailable(); + MLI = getAnalysisIfAvailable(); AA = &getAnalysis().getAAResults(); OptLevel = TM.getOptLevel(); Index: test/CodeGen/X86/twoaddr-recurrence.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/twoaddr-recurrence.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s +; +; CHECK: bb7 +; CHECK: cmpl{{ +}}$10 +; CHECK-NOT: movl +; CHECK: jl +; +; Check that there's no redundant copy at the end of bb7 to feed %vreg3 to bb1. + +define i32 @foo(i32 %a) { +bb0: + br label %bb1 + +bb1: ; preds = %bb0, %bb7 + %vreg0 = phi i32 [ 0 , %bb0 ], [ %vreg3, %bb7 ] + %cond0 = icmp eq i32 %a, 0 + br i1 %cond0, label %bb2, label %bb3 + +bb2: ; preds = %bb1 + br label %bb4 + +bb3: ; preds = %bb1 + br label %bb4 + +bb4: ; preds = %bb2, %bb3 + %vreg5 = phi i32 [ 1, %bb2 ], [ 2, %bb3 ] + %cond1 = icmp eq i32 %vreg5, 0 + br i1 %cond1, label %bb5, label %bb6 + +bb5: ; preds = %bb4 + br label %bb7 + +bb6: ; preds = %bb4 + br label %bb7 + +bb7: ; preds = %bb5, %bb6 + %vreg1 = phi i32 [ 1, %bb5 ], [ 2, %bb6 ] + %vreg2 = add i32 %vreg5, %vreg0 + %vreg3 = add i32 %vreg1, %vreg2 + %cond2 = icmp slt i32 %vreg3, 10 + br i1 %cond2, label %bb1, label %bb8 + +bb8: ; preds = %bb7 + ret i32 0 +}