Index: llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/trunk/lib/CodeGen/PeepholeOptimizer.cpp
@@ -76,6 +76,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -119,6 +120,14 @@
     "rewrite-phi-limit", cl::Hidden, cl::init(10),
     cl::desc("Limit the length of PHI chains to lookup"));
 
+// Limit the length of recurrence chain when evaluating the benefit of
+// commuting operands.
+static cl::opt<unsigned> MaxRecurrenceChain(
+    "recurrence-chain-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum length of recurrence chain when evaluating the benefit "
+             "of commuting operands"));
+
+
 STATISTIC(NumReuse,      "Number of extension results reused");
 STATISTIC(NumCmps,       "Number of compares eliminated");
 STATISTIC(NumImmFold,    "Number of move immediate folded");
@@ -131,12 +140,14 @@
 namespace {
 
   class ValueTrackerResult;
+  class RecurrenceInstr;
 
   class PeepholeOptimizer : public MachineFunctionPass {
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
     MachineRegisterInfo *MRI;
     MachineDominatorTree *DT;  // Machine dominator tree
+    MachineLoopInfo *MLI;
 
   public:
     static char ID; // Pass identification
@@ -150,6 +161,8 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<MachineLoopInfo>();
+      AU.addPreserved<MachineLoopInfo>();
       if (Aggressive) {
         AU.addRequired<MachineDominatorTree>();
         AU.addPreserved<MachineDominatorTree>();
@@ -160,6 +173,9 @@
     typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
         RewriteMapTy;
 
+    /// \brief Sequence of instructions that formulate a recurrence cycle.
+    typedef SmallVector<RecurrenceInstr, 4> RecurrenceCycle;
+
   private:
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
@@ -170,6 +186,7 @@
     bool optimizeCoalescableCopy(MachineInstr *MI);
     bool optimizeUncoalescableCopy(MachineInstr *MI,
                                    SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+    bool optimizeRecurrence(MachineInstr &PHI);
     bool findNextSource(unsigned Reg, unsigned SubReg,
                         RewriteMapTy &RewriteMap);
     bool isMoveImmediate(MachineInstr *MI,
@@ -178,6 +195,13 @@
     bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
                        SmallSet<unsigned, 4> &ImmDefRegs,
                        DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+    /// \brief Finds recurrence cycles, but only ones that are formulated
+    /// around a def operand and a use operand that are tied. If there is a
+    /// use operand commutable with the tied use operand, find the recurrence
+    /// cycle along that operand as well.
+    bool findTargetRecurrence(unsigned Reg,
+                              const SmallSet<unsigned, 2> &TargetReg,
+                              RecurrenceCycle &RC);
 
     /// \brief If copy instruction \p MI is a virtual register copy, track it in
     /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
@@ -222,6 +246,28 @@
     }
   };
 
+  /// \brief Helper class to hold instructions that are inside recurrence
+  /// cycles. The recurrence cycle is formulated around 1) a def operand and
+  /// its tied use operand, or 2) a def operand and a use operand that is
+  /// commutable with another use operand which is tied to the def operand. In
+  /// the latter case, the indices of the tied use operand and the commutable
+  /// use operand are maintained with CommutePair.
+  class RecurrenceInstr {
+  public:
+    typedef std::pair<unsigned, unsigned> IndexPair;
+
+    RecurrenceInstr(MachineInstr *MI) : MI(MI) {}
+    RecurrenceInstr(MachineInstr *MI, unsigned Idx1, unsigned Idx2)
+      : MI(MI), CommutePair(std::make_pair(Idx1, Idx2)) {}
+
+    MachineInstr *getMI() const { return MI; }
+    Optional<IndexPair> getCommutePair() const { return CommutePair; }
+
+  private:
+    MachineInstr *MI;
+    Optional<IndexPair> CommutePair;
+  };
+
 /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
 /// returned sources for a given search and the instructions where the sources
 /// were tracked from.
@@ -412,6 +458,7 @@
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                       "Peephole Optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                     "Peephole Optimizations", false, false)
 
@@ -1487,6 +1534,113 @@
   return false;
 }
 
+/// \brief Returns true if \p MO is a virtual register operand.
+static bool isVirtualRegisterOperand(MachineOperand &MO) {
+  if (!MO.isReg())
+    return false;
+  return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+}
+
+bool PeepholeOptimizer::findTargetRecurrence(
+    unsigned Reg, const SmallSet<unsigned, 2> &TargetRegs,
+    RecurrenceCycle &RC) {
+  // Recurrence found if Reg is in TargetRegs.
+  if (TargetRegs.count(Reg))
+    return true;
+
+  // TODO: Currently, we only allow the last instruction of the recurrence
+  // cycle (the instruction that feeds the PHI instruction) to have more than
+  // one use, to guarantee that commuting operands does not tie registers
+  // with overlapping live ranges. Once we have actual live-range info for
+  // each register, this constraint can be relaxed.
+  if (!MRI->hasOneNonDBGUse(Reg))
+    return false;
+
+  // Give up if the recurrence chain length is longer than the limit.
+  if (RC.size() >= MaxRecurrenceChain)
+    return false;
+
+  MachineInstr &MI = *(MRI->use_instr_nodbg_begin(Reg));
+  unsigned Idx = MI.findRegisterUseOperandIdx(Reg);
+
+  // Only interested in recurrences whose instructions have only one def, which
+  // is a virtual register.
+  if (MI.getDesc().getNumDefs() != 1)
+    return false;
+
+  MachineOperand &DefOp = MI.getOperand(0);
+  if (!isVirtualRegisterOperand(DefOp))
+    return false;
+
+  // Check if the def operand of MI is tied to any use operand. We are only
+  // interested in the case that all the instructions in the recurrence chain
+  // have their def operand tied to one of the use operands.
+  unsigned TiedUseIdx;
+  if (!MI.isRegTiedToUseOperand(0, &TiedUseIdx))
+    return false;
+
+  if (Idx == TiedUseIdx) {
+    RC.push_back(RecurrenceInstr(&MI));
+    return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+  } else {
+    // If Idx is not TiedUseIdx, check if Idx is commutable with TiedUseIdx.
+    unsigned CommIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+    if (TII->findCommutedOpIndices(MI, Idx, CommIdx) && CommIdx == TiedUseIdx) {
+      RC.push_back(RecurrenceInstr(&MI, Idx, CommIdx));
+      return findTargetRecurrence(DefOp.getReg(), TargetRegs, RC);
+    }
+  }
+
+  return false;
+}
+
+/// \brief Phi instructions will eventually be lowered to copy instructions.
+/// If a phi is in a loop header, a recurrence may be formulated around the
+/// source and destination of the phi. In such a case, commuting operands of
+/// the instructions in the recurrence may enable coalescing of the copy
+/// instruction generated from the phi.
+/// For example, if there is a recurrence of
+///
+/// LoopHeader:
+///   %vreg1 = phi(%vreg0, %vreg100)
+/// LoopLatch:
+///   %vreg0 = ADD %vreg2, %vreg1
+///
+/// , the fact that %vreg0 and %vreg2 are in the same tied-operands set makes
+/// the coalescing of the copy instruction generated from the phi in
+/// LoopHeader (i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
+/// %vreg2 have overlapping live ranges. This introduces an additional move
+/// instruction into the final assembly. However, if we commute %vreg2 and
+/// %vreg1 of the ADD instruction, the redundant move instruction can be
+/// avoided.
+bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
+  SmallSet<unsigned, 2> TargetRegs;
+  for (unsigned Idx = 1; Idx < PHI.getNumOperands(); Idx += 2) {
+    MachineOperand &MO = PHI.getOperand(Idx);
+    assert(isVirtualRegisterOperand(MO) && "Invalid PHI instruction");
+    TargetRegs.insert(MO.getReg());
+  }
+
+  bool Changed = false;
+  RecurrenceCycle RC;
+  if (findTargetRecurrence(PHI.getOperand(0).getReg(), TargetRegs, RC)) {
+    // Commute operands of the instructions in RC if necessary so that the
+    // copy to be generated from PHI can be coalesced.
+    DEBUG(dbgs() << "Optimize recurrence chain from " << PHI);
+    for (auto &RI : RC) {
+      DEBUG(dbgs() << "\tInst: " << *(RI.getMI()));
+      auto CP = RI.getCommutePair();
+      if (CP) {
+        Changed = true;
+        TII->commuteInstruction(*(RI.getMI()), false, (*CP).first,
+                                (*CP).second);
+        DEBUG(dbgs() << "\t\tCommuted: " << *(RI.getMI()));
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
@@ -1501,6 +1655,7 @@
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+  MLI = &getAnalysis<MachineLoopInfo>();
 
   bool Changed = false;
 
@@ -1529,6 +1684,8 @@
     SmallSet<unsigned, 4> CopySrcRegs;
     DenseMap<unsigned, MachineInstr *> CopySrcMIs;
 
+    bool IsLoopHeader = MLI->isLoopHeader(&MBB);
+
     for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
          MII != MIE; ) {
       MachineInstr *MI = &*MII;
@@ -1540,9 +1697,16 @@
       if (MI->isDebugValue())
           continue;
 
-      if (MI->isPosition() || MI->isPHI())
+      if (MI->isPosition())
         continue;
 
+      if (IsLoopHeader && MI->isPHI()) {
+        if (optimizeRecurrence(*MI)) {
+          Changed = true;
+          continue;
+        }
+      }
+
       if (!MI->isCopy()) {
         for (const auto &Op : MI->operands()) {
           // Visit all operands: definitions can be implicit or explicit.
@@ -1667,7 +1831,7 @@
         MRI->markUsesInDebugValueAsUndef(FoldedReg);
         FoldAsLoadDefCandidates.erase(FoldedReg);
         ++NumLoadFold;
-        
+
         // MI is replaced with FoldMI so we can continue trying to fold
         Changed = true;
         MI = FoldMI;
@@ -1675,7 +1839,7 @@
         }
       }
     }
-    
+
     // If we run into an instruction we can't fold across, discard
     // the load candidates. Note: We might be able to fold *into* this
     // instruction, so this needs to be after the folding logic.
Index: llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ llvm/trunk/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -68,6 +68,13 @@
     cl::desc("Coalesce copies by rescheduling (default=true)"), cl::init(true),
     cl::Hidden);
 
+// Limit the number of dataflow edges to traverse when evaluating the benefit
+// of commuting operands.
+static cl::opt<unsigned> MaxDataFlowEdge(
+    "dataflow-edge-limit", cl::Hidden, cl::init(3),
+    cl::desc("Maximum number of dataflow edges to traverse when evaluating "
+             "the benefit of commuting operands"));
+
 namespace {
 class TwoAddressInstructionPass : public MachineFunctionPass {
   MachineFunction *MF;
@@ -637,10 +644,10 @@
   // To more generally minimize register copies, ideally the logic of two addr
   // instruction pass should be integrated with register allocation pass where
   // interference graph is available.
-  if (isRevCopyChain(regC, regA, 3))
+  if (isRevCopyChain(regC, regA, MaxDataFlowEdge))
     return true;
 
-  if (isRevCopyChain(regB, regA, 3))
+  if (isRevCopyChain(regB, regA, MaxDataFlowEdge))
     return false;
 
   // Since there are no intervening uses for both registers, then commute
Index: llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir
===================================================================
--- llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir
+++ llvm/trunk/test/CodeGen/MIR/Generic/multiRunPass.mir
@@ -7,7 +7,8 @@
 # This test ensures that the command line accepts
 # several run passes on the same command line and
 # actually create the proper pipeline for it.
-# PSEUDO_PEEPHOLE: -expand-isel-pseudos {{(-machineverifier )?}}-peephole-opt
+# PSEUDO_PEEPHOLE: -expand-isel-pseudos
+# PSEUDO_PEEPHOLE-SAME: {{(-machineverifier )?}}-peephole-opt
 # PEEPHOLE_PSEUDO: -peephole-opt {{(-machineverifier )?}}-expand-isel-pseudos
 # Make sure there are no other passes happening after what we asked.
Index: llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir
===================================================================
--- llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir
+++ llvm/trunk/test/CodeGen/X86/peephole-recurrence.mir
@@ -0,0 +1,232 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=peephole-opt -o - %s | FileCheck %s
+
+--- |
+  define i32 @foo(i32 %a) {
+  bb0:
+    br label %bb1
+
+  bb1:                                       ; preds = %bb7, %bb0
+    %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+    %cond0 = icmp eq i32 %a, 0
+    br i1 %cond0, label %bb4, label %bb3
+
+  bb3:                                       ; preds = %bb1
+    br label %bb4
+
+  bb4:                                       ; preds = %bb1, %bb3
+    %vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
+    %cond1 = icmp eq i32 %vreg5, 0
+    br i1 %cond1, label %bb7, label %bb6
+
+  bb6:                                       ; preds = %bb4
+    br label %bb7
+
+  bb7:                                       ; preds = %bb4, %bb6
+    %vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
+    %vreg2 = add i32 %vreg5, %vreg0
+    %vreg3 = add i32 %vreg1, %vreg2
+    %cond2 = icmp slt i32 %vreg3, 10
+    br i1 %cond2, label %bb1, label %bb8
+
+  bb8:                                       ; preds = %bb7
+    ret i32 0
+  }
+
+  define i32 @bar(i32 %a, i32* %p) {
+  bb0:
+    br label %bb1
+
+  bb1:                                       ; preds = %bb7, %bb0
+    %vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
+    %cond0 = icmp eq i32 %a, 0
+    br i1 %cond0, label %bb4, label %bb3
+
+  bb3:                                       ; preds = %bb1
+    br label %bb4
+
+  bb4:                                       ; preds = %bb1, %bb3
+    %vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
+    %cond1 = icmp eq i32 %vreg5, 0
+    br i1 %cond1, label %bb7, label %bb6
+
+  bb6:                                       ; preds = %bb4
+    br label %bb7
+
+  bb7:                                       ; preds = %bb4, %bb6
+    %vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
+    %vreg2 = add i32 %vreg5, %vreg0
+    store i32 %vreg0, i32* %p
+    %vreg3 = add i32 %vreg1, %vreg2
+    %cond2 = icmp slt i32 %vreg3, 10
+    br i1 %cond2, label %bb1, label %bb8
+
+  bb8:                                       ; preds = %bb7
+    ret i32 0
+  }
+
+...
+---
+# There is a recurrence formulated around %0, %10, and %3. Check that operands
+# are commuted for the ADD instructions in bb.5.bb7 so that the values involved
+# in the recurrence are tied. This will remove a redundant copy instruction.
+name: foo
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr32, preferred-register: '' }
+  - { id: 1, class: gr32, preferred-register: '' }
+  - { id: 2, class: gr32, preferred-register: '' }
+  - { id: 3, class: gr32, preferred-register: '' }
+  - { id: 4, class: gr32, preferred-register: '' }
+  - { id: 5, class: gr32, preferred-register: '' }
+  - { id: 6, class: gr32, preferred-register: '' }
+  - { id: 7, class: gr32, preferred-register: '' }
+  - { id: 8, class: gr32, preferred-register: '' }
+  - { id: 9, class: gr32, preferred-register: '' }
+  - { id: 10, class: gr32, preferred-register: '' }
+  - { id: 11, class: gr32, preferred-register: '' }
+  - { id: 12, class: gr32, preferred-register: '' }
+liveins:
+  - { reg: '%edi', virtual-reg: '%4' }
+body: |
+  bb.0.bb0:
+    successors: %bb.1.bb1(0x80000000)
+    liveins: %edi
+
+    %4 = COPY %edi
+    %5 = MOV32r0 implicit-def dead %eflags
+
+  bb.1.bb1:
+    successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
+
+    ; CHECK: %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+    %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+    %6 = MOV32ri 1
+    TEST32rr %4, %4, implicit-def %eflags
+    JE_1 %bb.3.bb4, implicit %eflags
+    JMP_1 %bb.2.bb3
+
+  bb.2.bb3:
+    successors: %bb.3.bb4(0x80000000)
+
+    %7 = MOV32ri 2
+
+  bb.3.bb4:
+    successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
+
+    %1 = PHI %6, %bb.1.bb1, %7, %bb.2.bb3
+    TEST32rr %1, %1, implicit-def %eflags
+    JE_1 %bb.5.bb7, implicit %eflags
+    JMP_1 %bb.4.bb6
+
+  bb.4.bb6:
+    successors: %bb.5.bb7(0x80000000)
+
+    %9 = MOV32ri 2
+
+  bb.5.bb7:
+    successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
+
+    %2 = PHI %6, %bb.3.bb4, %9, %bb.4.bb6
+    %10 = ADD32rr %1, %0, implicit-def dead %eflags
+    ; CHECK: %10 = ADD32rr
+    ; CHECK-SAME: %0,
+    ; CHECK-SAME: %1,
+    %3 = ADD32rr %2, killed %10, implicit-def dead %eflags
+    ; CHECK: %3 = ADD32rr
+    ; CHECK-SAME: %10,
+    ; CHECK-SAME: %2,
+    %11 = SUB32ri8 %3, 10, implicit-def %eflags
+    JL_1 %bb.1.bb1, implicit %eflags
+    JMP_1 %bb.6.bb8
+
+  bb.6.bb8:
+    %12 = MOV32r0 implicit-def dead %eflags
+    %eax = COPY %12
+    RET 0, %eax
+
+...
+---
+# Here a recurrence is formulated around %0, %11, and %3, but operands should
+# not be commuted because %0 has a use outside of the recurrence. This is to
+# prevent commuting operands from tying values with overlapping live ranges.
+name: bar
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr32, preferred-register: '' }
+  - { id: 1, class: gr32, preferred-register: '' }
+  - { id: 2, class: gr32, preferred-register: '' }
+  - { id: 3, class: gr32, preferred-register: '' }
+  - { id: 4, class: gr32, preferred-register: '' }
+  - { id: 5, class: gr64, preferred-register: '' }
+  - { id: 6, class: gr32, preferred-register: '' }
+  - { id: 7, class: gr32, preferred-register: '' }
+  - { id: 8, class: gr32, preferred-register: '' }
+  - { id: 9, class: gr32, preferred-register: '' }
+  - { id: 10, class: gr32, preferred-register: '' }
+  - { id: 11, class: gr32, preferred-register: '' }
+  - { id: 12, class: gr32, preferred-register: '' }
+  - { id: 13, class: gr32, preferred-register: '' }
+liveins:
+  - { reg: '%edi', virtual-reg: '%4' }
+  - { reg: '%rsi', virtual-reg: '%5' }
+body: |
+  bb.0.bb0:
+    successors: %bb.1.bb1(0x80000000)
+    liveins: %edi, %rsi
+
+    %5 = COPY %rsi
+    %4 = COPY %edi
+    %6 = MOV32r0 implicit-def dead %eflags
+
+  bb.1.bb1:
+    successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
+
+    %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+    ; CHECK: %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+    %7 = MOV32ri 1
+    TEST32rr %4, %4, implicit-def %eflags
+    JE_1 %bb.3.bb4, implicit %eflags
+    JMP_1 %bb.2.bb3
+
+  bb.2.bb3:
+    successors: %bb.3.bb4(0x80000000)
+
+    %8 = MOV32ri 2
+
+  bb.3.bb4:
+    successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
+
+    %1 = PHI %7, %bb.1.bb1, %8, %bb.2.bb3
+    TEST32rr %1, %1, implicit-def %eflags
+    JE_1 %bb.5.bb7, implicit %eflags
+    JMP_1 %bb.4.bb6
+
+  bb.4.bb6:
+    successors: %bb.5.bb7(0x80000000)
+
+    %10 = MOV32ri 2
+
+  bb.5.bb7:
+    successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
+
+    %2 = PHI %7, %bb.3.bb4, %10, %bb.4.bb6
+    %11 = ADD32rr %1, %0, implicit-def dead %eflags
+    ; CHECK: %11 = ADD32rr
+    ; CHECK-SAME: %1,
+    ; CHECK-SAME: %0,
+    MOV32mr %5, 1, _, 0, _, %0 :: (store 4 into %ir.p)
+    %3 = ADD32rr %2, killed %11, implicit-def dead %eflags
+    ; CHECK: %3 = ADD32rr
+    ; CHECK-SAME: %2,
+    ; CHECK-SAME: %11,
+    %12 = SUB32ri8 %3, 10, implicit-def %eflags
+    JL_1 %bb.1.bb1, implicit %eflags
+    JMP_1 %bb.6.bb8
+
+  bb.6.bb8:
+    %13 = MOV32r0 implicit-def dead %eflags
+    %eax = COPY %13
+    RET 0, %eax
+
+...
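
For illustration only, here is a rough, hypothetical C equivalent of the @foo test above (it is not part of the patch); the accumulator x is the recurrence value carried by the loop-header PHI, and the commutation performed by the peephole corresponds to swapping the ADD operands so that x ends up in the tied-operand position.

// Hypothetical C source mirroring the IR of @foo in the test above.
// x is the recurrence value carried by the loop-header PHI (%vreg0/%0).
int foo(int a) {
  int x = 0;
  do {
    int b = (a == 0) ? 1 : 2;   // %vreg5 / %1
    int c = (b == 0) ? 1 : 2;   // %vreg1 / %2
    // Two chained additions feed x back into itself. On a two-address
    // target such as x86, commuting each ADD so that the operand carrying
    // x is the tied one lets the COPY inserted for the PHI coalesce away.
    x = c + (b + x);            // %vreg2 = b + x, %vreg3 = c + %vreg2
  } while (x < 10);
  return 0;
}

Running the test's RUN line (llc -mtriple=x86_64-- -run-pass=peephole-opt) on the MIR above should then commute both ADD32rr instructions in @foo, which is what the CHECK-SAME lines verify, while leaving @bar untouched because %0 has a second use in the store.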