Index: lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- lib/CodeGen/PeepholeOptimizer.cpp
+++ lib/CodeGen/PeepholeOptimizer.cpp
@@ -77,6 +77,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -119,6 +120,14 @@
     "rewrite-phi-limit", cl::Hidden, cl::init(10),
     cl::desc("Limit the length of PHI chains to lookup"));
 
+// Limit the length of recurrence chain when evaluating the benefit of
+// commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+    "recurrence-chain-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+             "of commuting operands"));
+
+
 STATISTIC(NumReuse, "Number of extension results reused");
 STATISTIC(NumCmps, "Number of compares eliminated");
 STATISTIC(NumImmFold, "Number of move immediate folded");
@@ -131,12 +140,14 @@
 namespace {
   class ValueTrackerResult;
+  class RecurrenceInstr;
 
   class PeepholeOptimizer : public MachineFunctionPass {
     const TargetInstrInfo *TII;
    const TargetRegisterInfo *TRI;
     MachineRegisterInfo *MRI;
     MachineDominatorTree *DT; // Machine dominator tree
+    MachineLoopInfo *MLI;
 
   public:
     static char ID; // Pass identification
@@ -150,6 +161,8 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
       if (Aggressive) {
         AU.addRequired<MachineDominatorTree>();
         AU.addPreserved<MachineDominatorTree>();
@@ -160,6 +173,9 @@
     typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
         RewriteMapTy;
 
+    /// \brief Sequence of instructions that formulate a recurrence cycle.
+    typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
   private:
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeRecurrence(MachineInstr &PHI);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr *> &ImmDefMIs);
+
+    /// \brief Finds recurrence cycles, but only ones that are formulated
+    /// around a def operand and a use operand that are tied. If there is a
+    /// use operand commutable with the tied use operand, find the recurrence
+    /// cycle along that operand as well.
+    void findTargetRecurrence(unsigned DefReg, unsigned TargetReg,
+                              RecurrenceCycle &RC,
+                              SmallVectorImpl<RecurrenceCycle> &RCs);
 
     /// \brief If copy instruction \p MI is a virtual register copy, track it in
     /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
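A note for readers less familiar with this corner of CodeGen: the recurrence search declared above is driven entirely by tied operands, the def/use pairs that two-address targets force into the same register. A minimal sketch of the underlying query (illustrative only, not part of the patch; printTiedUse is a hypothetical helper):

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

// Report which use operand, if any, is tied to the def in operand 0.
// For "%vreg0 = ADD %vreg2, %vreg1" on a two-address target, the def
// %vreg0 is tied to the first source, so this prints use operand #1.
static void printTiedUse(const MachineInstr &MI) {
  unsigned TiedUseIdx;
  if (MI.isRegTiedToUseOperand(/*DefOpIdx=*/0, &TiedUseIdx))
    dbgs() << "def is tied to use operand #" << TiedUseIdx << "\n";
}

findTargetRecurrence below relies on exactly this query, via the DefIdx it computes, to decide which use operand to follow.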
@@ -222,6 +246,28 @@
     }
   };
 
+  /// \brief Helper class to hold instructions that are inside recurrence
+  /// cycles. The recurrence cycle is formulated around 1) a def operand and
+  /// its tied use operand, or 2) a def operand and a use operand that is
+  /// commutable with another use operand which is tied to the def operand.
+  /// In the latter case, the indices of the tied use operand and the
+  /// commutable use operand are maintained with CommutePair.
+  class RecurrenceInstr {
+  public:
+    typedef std::pair<unsigned, unsigned> IndexPair;
+
+    RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+    RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+        : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+    MachineInstr *getMI() const { return MI; }
+    Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+  private:
+    MachineInstr *MI;
+    Optional<IndexPair> CommutePair;
+  };
+
   /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
   /// returned sources for a given search and the instructions where the sources
   /// were tracked from.
@@ -412,6 +458,7 @@
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                       "Peephole Optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                     "Peephole Optimizations", false, false)
@@ -1487,6 +1534,114 @@
   return false;
 }
 
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+  return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+void PeepholeOptimizer::findTargetRecurrence(
+    unsigned DefReg, unsigned TargetReg, RecurrenceCycle &RC,
+    SmallVectorImpl<RecurrenceCycle> &RCs) {
+  // Recurrence found if (DefReg == TargetReg).
+  if (DefReg == TargetReg) {
+    RCs.push_back(RC);
+    return;
+  }
+
+  // Give up if the recurrence chain length is longer than the limit.
+  if (RC.size() >= MaxRecurrenceChain)
+    return;
+
+  MachineInstr *DefMI = MRI->getVRegDef(DefReg);
+  int DefIdx = DefMI->findRegisterDefOperandIdx(DefReg);
+  assert(DefIdx != -1 && "DefReg is not defined by DefMI");
+
+  // Check if there is a use operand that is tied to DefReg. We are only
+  // interested in the case where all the instructions in the recurrence
+  // chain have their def operand tied with one of the use operands.
+  unsigned TiedUseIdx;
+  if (!DefMI->isRegTiedToUseOperand(DefIdx, &TiedUseIdx))
+    return;
+
+  auto Progress = [&](unsigned Idx, RecurrenceInstr &&RI) {
+    MachineOperand &MO = DefMI->getOperand(Idx);
+    if (isVirtualRegisterOperand(MO)) {
+      RC.push_back(RI);
+      findTargetRecurrence(MO.getReg(), TargetReg, RC, RCs);
+      RC.pop_back();
+    }
+  };
+
+  // Continue the search with the tied use operand.
+  Progress(TiedUseIdx, RecurrenceInstr(DefMI));
+
+  // If the tied use operand is commutable, continue the search with the
+  // commutable operand as well.
+  unsigned CommutableIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+  if (TII->findCommutedOpIndices(*DefMI, TiedUseIdx, CommutableIdx))
+    Progress(CommutableIdx, RecurrenceInstr(DefMI, TiedUseIdx, CommutableIdx));
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions.
+/// If phi is in a loop header, a recurrence may be formulated around the
+/// source and destination of the phi. For such a case, commuting operands of
+/// the instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi. For example, if there is a recurrence
+/// of
+///
+/// LoopHeader:
+///   %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+///   %vreg0 = ADD %vreg2, %vreg1
+///
+/// , the fact that %vreg0 and %vreg2 are in the same tied operands set makes
+/// the coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
+/// %vreg2 have overlapping live ranges. This introduces an additional move
+/// instruction into the final assembly. However, if we commute %vreg2 and
+/// %vreg1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+  bool Changed = false;
+  unsigned TargetReg = PHI.getOperand(0).getReg();
+
+  // Find recurrence cycles of interest that are formulated around PHI.
+  // Except for the PHI itself, all the instructions in the cycles are
+  // required to have their def operand tied with one of their source
+  // operands.
+  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+    MachineOperand &MO = PHI.getOperand(Idx);
+    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+
+    unsigned Reg = MO.getReg();
+    MachineInstr *MI = MRI->getVRegDef(Reg);
+    if (!MI)
+      continue;
+
+    RecurrenceCycle RC;
+    SmallVector<RecurrenceCycle, 4> RCs;
+    findTargetRecurrence(Reg, TargetReg, RC, RCs);
+
+    // Commute operands of the instructions in each RC if necessary so that
+    // the copy to be generated from PHI can be coalesced.
+    for (auto &RC : RCs) {
+      DEBUG(dbgs() << "Optimize recurrence chain from " << PHI << "\n");
+      for (auto RI : RC) {
+        DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+        auto CP = RI.getCommutePair();
+        if (CP) {
+          Changed = true;
+          TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+                                  (*CP).second);
+          DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
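As a worked example, consider bb7 of the new test at the end of this patch (test/CodeGen/X86/peephole-recurrence.ll). After instruction selection it contains, schematically (assuming the usual x86 two-address form, where each ADD's def is tied to its first source):

  bb1: %vreg0 = PHI 0, <bb0>, %vreg3, <bb7>   ; TargetReg = %vreg0
  bb7: %vreg2 = ADD %vreg5, %vreg0
       %vreg3 = ADD %vreg1, %vreg2

findTargetRecurrence(%vreg3, %vreg0, ...) first follows the tied use %vreg1 of the second ADD, which dead-ends because %vreg1 is defined by a PHI with no tied operands. It then tries the commutable use %vreg2; from there the tied use %vreg5 dead-ends the same way, but the commutable use %vreg0 equals TargetReg, so a two-instruction cycle is recorded with a CommutePair on each ADD. optimizeRecurrence then commutes both ADDs, after which %vreg0, %vreg2, and %vreg3 sit on a single tied chain and the COPY lowered from the PHI can be coalesced, which is exactly the redundant-movl elimination the test checks for.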
@@ -1501,6 +1656,7 @@
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+  MLI = &getAnalysis<MachineLoopInfo>();
 
   bool Changed = false;
 
@@ -1529,6 +1685,8 @@
     SmallSet<unsigned, 4> CopySrcRegs;
     DenseMap<unsigned, MachineInstr *> CopySrcMIs;
 
+    bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
     for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
          MII != MIE; ) {
       MachineInstr *MI = &*MII;
@@ -1540,9 +1698,16 @@
       if (MI->isDebugValue())
         continue;
 
-      if (MI->isPosition() || MI->isPHI())
+      if (MI->isPosition())
         continue;
 
+      if (IsLoopHeader && MI->isPHI()) {
+        if (optimizeRecurrence(*MI)) {
+          Changed = true;
+          continue;
+        }
+      }
+
       if (!MI->isCopy()) {
         for (const auto &Op : MI->operands()) {
           // Visit all operands: definitions can be implicit or explicit.
@@ -1667,7 +1832,7 @@
         MRI->markUsesInDebugValueAsUndef(FoldedReg);
         FoldAsLoadDefCandidates.erase(FoldedReg);
         ++NumLoadFold;
-        
+
         // MI is replaced with FoldMI so we can continue trying to fold
         Changed = true;
         MI = FoldMI;
@@ -1675,7 +1840,7 @@
       }
     }
   }
-  
+
   // If we run into an instruction we can't fold across, discard
   // the load candidates. Note: We might be able to fold *into* this
   // instruction, so this needs to be after the folding logic.
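The commuting itself is done with two existing TargetInstrInfo hooks, findCommutedOpIndices and commuteInstruction, as used in findTargetRecurrence and optimizeRecurrence above. A minimal sketch of the call pattern in isolation (illustrative only; DefMI, TiedUseIdx, and TII stand in for the values used in the patch):

// Ask the target whether the tied use operand (Idx1) can legally trade
// places with some other use operand; CommuteAnyOperandIndex lets the
// target pick the partner.
unsigned Idx1 = TiedUseIdx;
unsigned Idx2 = TargetInstrInfo::CommuteAnyOperandIndex;
if (TII->findCommutedOpIndices(*DefMI, Idx1, Idx2)) {
  // With NewMI == false the instruction is commuted in place and the
  // same MachineInstr pointer is returned.
  TII->commuteInstruction(*DefMI, /*NewMI=*/false, Idx1, Idx2);
}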
Index: lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- lib/CodeGen/TwoAddressInstructionPass.cpp
+++ lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -68,6 +68,13 @@
     cl::desc("Coalesce copies by rescheduling (default=true)"),
     cl::init(true), cl::Hidden);
 
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+    "dataflow-edge-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+             "the benefit of commuting operands"));
+
 namespace {
 class TwoAddressInstructionPass : public MachineFunctionPass {
   MachineFunction *MF;
@@ -637,10 +644,10 @@
   // To more generally minimize register copies, ideally the logic of two addr
   // instruction pass should be integrated with register allocation pass where
   // interference graph is available.
-  if (isRevCopyChain(regC, regA, 3))
+  if (isRevCopyChain(regC, regA, MaxDataFlowEdge))
     return true;
 
-  if (isRevCopyChain(regB, regA, 3))
+  if (isRevCopyChain(regB, regA, MaxDataFlowEdge))
     return false;
 
   // Since there are no intervening uses for both registers, then commute
Index: test/CodeGen/MIR/Generic/multiRunPass.mir
===================================================================
--- test/CodeGen/MIR/Generic/multiRunPass.mir
+++ test/CodeGen/MIR/Generic/multiRunPass.mir
@@ -7,7 +7,8 @@
 # This test ensures that the command line accepts
 # several run passes on the same command line and
 # actually create the proper pipeline for it.
-# PSEUDO_PEEPHOLE: -expand-isel-pseudos -peephole-opt
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos
+# PSEUDO_PEEPHOLE-SAME: -peephole-opt
 # PEEPHOLE_PSEUDO: -peephole-opt -expand-isel-pseudos
 
 # Make sure there are no other passes happening after what we asked.
Index: test/CodeGen/X86/fma-do-not-commute.ll
===================================================================
--- test/CodeGen/X86/fma-do-not-commute.ll
+++ test/CodeGen/X86/fma-do-not-commute.ll
@@ -9,9 +9,9 @@
 ; CHECK-NOT: {{.*}}, %xmm0
 ; %addr lives in rdi.
 ; %addr2 lives in rsi.
-; CHECK: vmovss (%rdi), [[ADDR:%xmm[0-9]+]]
+; CHECK: vmovss (%rsi), [[ADDR2:%xmm[0-9]+]]
 ; The assembly syntax is in the reverse order.
-; CHECK: vfmadd231ss (%rsi), [[ADDR]], %xmm0
+; CHECK: vfmadd231ss (%rdi), [[ADDR2]], %xmm0
 define void @test1(float* %addr, float* %addr2, float %arg) {
 entry:
   br label %loop
Index: test/CodeGen/X86/peephole-recurrence.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/peephole-recurrence.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+;
+; CHECK: bb7
+; CHECK: cmpl{{ +}}$10
+; CHECK-NOT: movl
+; CHECK: jl
+;
+; Check that there's no redundant copy at the end of bb7 to feed %vreg3 to bb1.
+
+define i32 @foo(i32 %a) {
+bb0:
+  br label %bb1
+
+bb1:                                              ; preds = %bb0, %bb7
+  %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+  %cond0 = icmp eq i32 %a, 0
+  br i1 %cond0, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  br label %bb4
+
+bb3:                                              ; preds = %bb1
+  br label %bb4
+
+bb4:                                              ; preds = %bb2, %bb3
+  %vreg5 = phi i32 [ 1, %bb2 ], [ 2, %bb3 ]
+  %cond1 = icmp eq i32 %vreg5, 0
+  br i1 %cond1, label %bb5, label %bb6
+
+bb5:                                              ; preds = %bb4
+  br label %bb7
+
+bb6:                                              ; preds = %bb4
+  br label %bb7
+
+bb7:                                              ; preds = %bb5, %bb6
+  %vreg1 = phi i32 [ 1, %bb5 ], [ 2, %bb6 ]
+  %vreg2 = add i32 %vreg5, %vreg0
+  %vreg3 = add i32 %vreg1, %vreg2
+  %cond2 = icmp slt i32 %vreg3, 10
+  br i1 %cond2, label %bb1, label %bb8
+
+bb8:                                              ; preds = %bb7
+  ret i32 0
+}
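Both limits introduced by this patch are ordinary cl::opt flags, so they can be tuned from the llc command line when experimenting. An illustrative invocation (not one of the RUN lines above):

  llc -march=x86-64 -recurrence-chain-limit=4 -dataflow-edge-limit=4 < test/CodeGen/X86/peephole-recurrence.ll

Raising -recurrence-chain-limit lets findTargetRecurrence follow longer chains of tied defs at some compile-time cost; both limits default to 3.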