Index: include/llvm/CodeGen/MachineInstr.h =================================================================== --- include/llvm/CodeGen/MachineInstr.h +++ include/llvm/CodeGen/MachineInstr.h @@ -1300,12 +1300,13 @@ /// Add all implicit def and use operands to this instruction. void addImplicitDefUseOperands(MachineFunction &MF); -private: /// If this instruction is embedded into a MachineFunction, return the /// MachineRegisterInfo object for the current function, otherwise /// return null. MachineRegisterInfo *getRegInfo(); +private: + /// Unlink all of the register operands in this instruction from their /// respective use lists. This requires that the operands already be on their /// use lists. Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -300,6 +300,9 @@ /// type legalization. bool NewNodesMustHaveLegalTypes = false; + /// Set to true for DAG of BasicBlock contained inside a loop. + bool IsDAGPartOfLoop = false; + private: /// DAGUpdateListener is a friend so it can manipulate the listener stack. friend struct DAGUpdateListener; Index: lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/FastISel.h" @@ -325,6 +326,8 @@ if (OptLevel != CodeGenOpt::None) AU.addRequired(); AU.addRequired(); + if (OptLevel != CodeGenOpt::None) + AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); @@ -1416,6 +1419,7 @@ // Iterate over all basic blocks in the function. for (const BasicBlock *LLVMBB : RPOT) { + CurDAG->IsDAGPartOfLoop = false; if (OptLevel != CodeGenOpt::None) { bool AllPredsVisited = true; for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB); @@ -1593,6 +1597,13 @@ FunctionBasedInstrumentation); } + if (OptLevel != CodeGenOpt::None) { + auto &LIWP = getAnalysis(); + LoopInfo &LI = LIWP.getLoopInfo(); + if (LI.getLoopFor(LLVMBB)) + CurDAG->IsDAGPartOfLoop = true; + } + if (Begin != BI) ++NumDAGBlocks; else Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -88,6 +88,11 @@ IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } + bool hasComplexAddressingMode() const { + return Disp && IndexReg.getNode() != nullptr && + Base_Reg.getNode() != nullptr; + } + /// Return true if this addressing mode is already RIP-relative. bool isRIPRelative() const { if (BaseType != RegBase) return false; @@ -97,6 +102,10 @@ return false; } + bool isLegalScale() { + return (Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8); + } + void setBaseReg(SDValue Reg) { BaseType = RegBase; Base_Reg = Reg; @@ -162,10 +171,13 @@ /// If true, selector should try to optimize for minimum code size. bool OptForMinSize; + /// If true, selector should try to aggresively fold operands into AM. + bool OptForAggressingFolding; + public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), OptForSize(false), - OptForMinSize(false) {} + OptForMinSize(false), OptForAggressingFolding(false) {} StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -184,6 +196,12 @@ void PreprocessISelDAG() override; + void setAggressiveOperandFolding(bool val = false) { + OptForAggressingFolding = val; + } + + bool getAggressiveOperandFolding() { return OptForAggressingFolding; } + // Include the pieces autogenerated from the target description. #include "X86GenDAGISel.inc" @@ -197,6 +215,7 @@ bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressLEA(SDValue N, X86ISelAddressMode &AM); bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -443,6 +462,20 @@ bool isMaskZeroExtended(SDNode *N) const; }; + + class X86AggressiveOperandFolding { + public: + explicit X86AggressiveOperandFolding(X86DAGToDAGISel &ISel, bool val) + : Selector(&ISel) { + Selector->setAggressiveOperandFolding(val); + } + ~X86AggressiveOperandFolding() { + Selector->setAggressiveOperandFolding(false); + } + + private: + X86DAGToDAGISel *Selector; + }; } @@ -1189,7 +1222,7 @@ AM.IndexReg = NewSRL; return false; } - + bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); @@ -1197,8 +1230,11 @@ dbgs() << "MatchAddress: "; AM.dump(); }); - // Limit recursion. - if (Depth > 5) + + // Limit recursion. For aggressive operand folding recurse + // till depth 8 which is the maximum legal scale value. + unsigned MaxDepth = getAggressiveOperandFolding() ? 8 : 5; + if (Depth > MaxDepth) return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates @@ -1489,6 +1525,20 @@ return false; } + if (OptLevel != CodeGenOpt::None && getAggressiveOperandFolding() && + AM.BaseType == X86ISelAddressMode::RegBase) { + if (AM.Base_Reg == N) { + SDValue Base_Reg = AM.Base_Reg; + AM.Base_Reg = AM.IndexReg; + AM.IndexReg = Base_Reg; + AM.Scale++; + return false; + } else if (AM.IndexReg == N) { + AM.Scale++; + return false; + } + } + // Otherwise, we cannot select it. return true; } @@ -1719,7 +1769,7 @@ SDValue &Disp, SDValue &Segment) { // Save the debug loc before calling selectLEAAddr, in case it invalidates N. SDLoc DL(N); - + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; @@ -1754,6 +1804,29 @@ return true; } +bool X86DAGToDAGISel::matchAddressLEA(SDValue N, X86ISelAddressMode &AM) { + // Avoid enabling aggressive operand folding when node N is a part of loop. + X86AggressiveOperandFolding Enable(*this, !CurDAG->IsDAGPartOfLoop); + + bool matchRes = matchAddress(N, AM); + + // Check for legality of scale when recursion unwinds back to the top. + if (!matchRes) { + if (!AM.isLegalScale()) + return true; + + // Avoid creating costly complex LEAs having scale less than 2 + // within loop. + if(CurDAG->IsDAGPartOfLoop && Subtarget->slow3OpsLEA() && + AM.Scale <= 2 && AM.hasComplexAddressingMode() && + (!AM.hasSymbolicDisplacement() && N.getOpcode() < ISD::BUILTIN_OP_END)) + return true; + } + + return matchRes; +} + + /// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::selectLEAAddr(SDValue N, @@ -1771,7 +1844,7 @@ SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; - if (matchAddress(N, AM)) + if (matchAddressLEA(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; Index: lib/Target/X86/X86OptimizeLEAs.cpp =================================================================== --- lib/Target/X86/X86OptimizeLEAs.cpp +++ lib/Target/X86/X86OptimizeLEAs.cpp @@ -26,6 +26,8 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -58,6 +60,7 @@ cl::init(false)); STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); +STATISTIC(NumFactoredLEAs, "Number of LEAs factorized"); STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed"); /// \brief Returns true if two machine operands are identical and they are not @@ -65,6 +68,10 @@ static inline bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); +/// \brief Returns true if two machine instructions have identical operands. +static bool isIdenticalMI(MachineRegisterInfo *MRI, const MachineOperand &MO1, + const MachineOperand &MO2); + /// \brief Returns true if two address displacement operands are of the same /// type and use the same symbol/index/address regardless of the offset. static bool isSimilarDispOp(const MachineOperand &MO1, @@ -73,6 +80,9 @@ /// \brief Returns true if the instruction is LEA. static inline bool isLEA(const MachineInstr &MI); +/// \brief Returns true if Definition of Operand is a copylike instruction. +static bool isDefCopyLike(MachineRegisterInfo *MRI, const MachineOperand &Opr); + namespace { /// A key based on instruction's memory operands. @@ -80,15 +90,35 @@ public: MemOpKey(const MachineOperand *Base, const MachineOperand *Scale, const MachineOperand *Index, const MachineOperand *Segment, - const MachineOperand *Disp) - : Disp(Disp) { + const MachineOperand *Disp, bool DispCheck = false) + : Disp(Disp), DeepCheck(DispCheck) { Operands[0] = Base; Operands[1] = Scale; Operands[2] = Index; Operands[3] = Segment; } + /// Checks operands of MemOpKey are identical, if Base or Index + /// operand definitions are of kind SUBREG_TO_REG then compare + /// operands of defining MI. + bool performDeepCheck(const MemOpKey &Other) const { + MachineInstr *MI = const_cast(Operands[0]->getParent()); + MachineRegisterInfo *MRI = MI->getRegInfo(); + + for (int i = 0; i < 4; i++) { + bool copyLike = isDefCopyLike(MRI, *Operands[i]); + if (copyLike && !isIdenticalMI(MRI, *Operands[i], *Other.Operands[i])) + return false; + else if (!copyLike && !isIdenticalOp(*Operands[i], *Other.Operands[i])) + return false; + } + return isIdenticalOp(*Disp, *Other.Disp); + } + bool operator==(const MemOpKey &Other) const { + if (DeepCheck) + return performDeepCheck(Other); + // Addresses' bases, scales, indices and segments must be identical. for (int i = 0; i < 4; ++i) if (!isIdenticalOp(*Operands[i], *Other.Operands[i])) @@ -106,6 +136,12 @@ // Address' displacement operand. const MachineOperand *Disp; + + // If true checks Address' base, index, segment and + // displacement are identical, in additions if base/index + // are defined by copylike instruction then futher + // compare the operands of the defining instruction. + bool DeepCheck; }; } // end anonymous namespace @@ -131,12 +167,34 @@ static unsigned getHashValue(const MemOpKey &Val) { // Checking any field of MemOpKey is enough to determine if the key is // empty or tombstone. + hash_code Hash(0); assert(Val.Disp != PtrInfo::getEmptyKey() && "Cannot hash the empty key"); assert(Val.Disp != PtrInfo::getTombstoneKey() && "Cannot hash the tombstone key"); - hash_code Hash = hash_combine(*Val.Operands[0], *Val.Operands[1], - *Val.Operands[2], *Val.Operands[3]); + auto getMIHash = [](MachineInstr *MI) -> hash_code { + hash_code h(0); + for (unsigned i = 1, e = MI->getNumOperands(); i < e; i++) + h = hash_combine(h, MI->getOperand(i)); + return h; + }; + + const MachineOperand &Base = *Val.Operands[0]; + const MachineOperand &Index = *Val.Operands[2]; + MachineInstr *MI = const_cast(Base.getParent()); + MachineRegisterInfo *MRI = MI->getRegInfo(); + + if (isDefCopyLike(MRI, Base)) + Hash = getMIHash(MRI->getVRegDef(Base.getReg())); + else + Hash = hash_combine(Hash, Base); + + if (isDefCopyLike(MRI, Index)) + Hash = getMIHash(MRI->getVRegDef(Index.getReg())); + else + Hash = hash_combine(Hash, Index); + + Hash = hash_combine(Hash, *Val.Operands[1], *Val.Operands[3]); // If the address displacement is an immediate, it should not affect the // hash so that memory operands which differ only be immediate displacement @@ -196,6 +254,16 @@ &MI.getOperand(N + X86::AddrDisp)); } +static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) { + static MachineOperand DummyScale = MachineOperand::CreateImm(1); + assert((isLEA(MI) || MI.mayLoadOrStore()) && + "The instruction must be a LEA, a load or a store"); + return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg), &DummyScale, + &MI.getOperand(N + X86::AddrIndexReg), + &MI.getOperand(N + X86::AddrSegmentReg), + &MI.getOperand(N + X86::AddrDisp), true); +} + static inline bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2) { return MO1.isIdenticalTo(MO2) && @@ -203,6 +271,27 @@ !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); } +static bool isIdenticalMI(MachineRegisterInfo *MRI, const MachineOperand &MO1, + const MachineOperand &MO2) { + MachineInstr *MI1 = nullptr; + MachineInstr *MI2 = nullptr; + if (!MO1.isReg() || !MO2.isReg()) + return false; + + MI1 = MRI->getVRegDef(MO1.getReg()); + MI2 = MRI->getVRegDef(MO2.getReg()); + if (!MI1 || !MI2) + return false; + if (MI1->getOpcode() != MI2->getOpcode()) + return false; + if (MI1->getNumOperands() != MI2->getNumOperands()) + return false; + for (unsigned i = 1, e = MI1->getNumOperands(); i < e; ++i) + if (!isIdenticalOp(MI1->getOperand(i), MI2->getOperand(i))) + return false; + return true; +} + #ifndef NDEBUG static bool isValidDispOp(const MachineOperand &MO) { return MO.isImm() || MO.isCPI() || MO.isJTI() || MO.isSymbol() || @@ -234,8 +323,150 @@ Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; } +static bool isDefCopyLike(MachineRegisterInfo *MRI, const MachineOperand &Opr) { + bool isInstrErased = Opr.isReg() && Opr.getParent()->getParent() ? 0 : 1; + if (!Opr.isReg() || isInstrErased || + TargetRegisterInfo::isPhysicalRegister(Opr.getReg())) + return false; + MachineInstr *MI = MRI->getVRegDef(Opr.getReg()); + return MI && MI->isCopyLike(); +} + namespace { +/// This class captures the functions and attributes +/// needed to factorize LEA within and across basic +/// blocks.LEA instruction with same BASE,OFFSET and +/// INDEX are the candidates for factorization. +class FactorizeLEAOpt { +public: + using LEAListT = std::list; + using LEAMapT = DenseMap; + using ValueT = DenseMap; + using ScopeEntryT = std::pair; + using ScopeStackT = std::vector; + + FactorizeLEAOpt() = default; + FactorizeLEAOpt(const FactorizeLEAOpt &) = delete; + FactorizeLEAOpt &operator=(const FactorizeLEAOpt &) = delete; + + void performCleanup() { + for (auto LEA : removedLEAs) + LEA->eraseFromParent(); + LEAs.clear(); + Stack.clear(); + removedLEAs.clear(); + } + + LEAMapT &getLEAMap() { return LEAs; } + ScopeEntryT *getTopScope() { return &Stack.back(); } + + void addForLazyRemoval(MachineInstr *Instr) { removedLEAs.insert(Instr); } + + bool checkIfScheduledForRemoval(MachineInstr *Instr) { + return removedLEAs.find(Instr) != removedLEAs.end(); + } + + /// Push the ScopeEntry for the BasicBlock over Stack. + /// Also traverses over list of instruction and update + /// LEAs Map and ScopeEntry for each LEA instruction + /// found using insertLEA(). + void pushScope(MachineBasicBlock *MBB); + + /// Stores the size of MachineInstr list corrosponding + /// to key K from LEAs MAP into the ScopeEntry of + /// the basic block, then insert the LEA at the beginning + /// of the list. + void insertLEA(MachineInstr *MI); + + /// Pops out ScopeEntry of top most BasicBlock from the stack + /// and remove the LEA instructions contained in the scope + /// from the LEAs Map. + void popScope(); + + /// If LEA contains Physical Registers then its not a candidate + /// for factorizations since physical registers may violate SSA + /// semantics of MI. + bool containsPhyReg(MachineInstr *MI, unsigned RecLevel); + +private: + ScopeStackT Stack; + LEAMapT LEAs; + std::set removedLEAs; +}; + +void FactorizeLEAOpt::pushScope(MachineBasicBlock *MBB) { + ValueT EmptyMap; + ScopeEntryT SE = std::make_pair(MBB, EmptyMap); + Stack.push_back(SE); + for (auto &MI : *MBB) { + if (isLEA(MI)) + insertLEA(&MI); + } +} + +void FactorizeLEAOpt::popScope() { + ScopeEntryT &SE = Stack.back(); + for (auto MapEntry : SE.second) { + LEAMapT::iterator Itr = LEAs.find(MapEntry.first); + assert((Itr != LEAs.end()) && + "LEAs map must have a node corresponding to ScopeEntry's Key."); + + while (((*Itr).second.size() > MapEntry.second)) + (*Itr).second.pop_front(); + // If list goes empty remove entry from LEAs Map. + if ((*Itr).second.empty()) + LEAs.erase(Itr); + } + Stack.pop_back(); +} + +bool FactorizeLEAOpt::containsPhyReg(MachineInstr *MI, unsigned RecLevel) { + if (!MI || !RecLevel) + return false; + + MachineRegisterInfo *MRI = MI->getRegInfo(); + for (auto Operand : MI->operands()) { + if (!Operand.isReg()) + continue; + if (TargetRegisterInfo::isPhysicalRegister(Operand.getReg())) + return true; + MachineInstr *OperDefMI = MRI->getVRegDef(Operand.getReg()); + if (OperDefMI && (MI != OperDefMI) && OperDefMI->isCopyLike() && + containsPhyReg(OperDefMI, RecLevel - 1)) + return true; + } + return false; +} + +void FactorizeLEAOpt::insertLEA(MachineInstr *MI) { + unsigned lsize; + if (containsPhyReg(MI, 2)) + return; + + // Factorization is beneficial only for complex LEAs. + MachineOperand &Base = MI->getOperand(1); + MachineOperand &Index = MI->getOperand(3); + MachineOperand &Offset = MI->getOperand(4); + if ((Offset.isImm() && !Offset.getImm()) || + (!Base.isReg() || !Base.getReg()) || (!Index.isReg() || !Index.getReg())) + return; + + MemOpKey Key = getMemOpCSEKey(*MI, 1); + ScopeEntryT *TopScope = getTopScope(); + + LEAMapT::iterator Itr = LEAs.find(Key); + if (Itr == LEAs.end()) { + lsize = 0; + LEAs[Key].push_front(MI); + } else { + lsize = (*Itr).second.size(); + (*Itr).second.push_front(MI); + } + if (TopScope->second.find(Key) == TopScope->second.end()) + TopScope->second[Key] = lsize; +} + class OptimizeLEAPass : public MachineFunctionPass { public: OptimizeLEAPass() : MachineFunctionPass(ID) {} @@ -247,6 +478,12 @@ /// been calculated by LEA. Also, remove redundant LEAs. bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + } + private: using MemOpMap = DenseMap>; @@ -292,8 +529,24 @@ /// \brief Removes LEAs which calculate similar addresses. bool removeRedundantLEAs(MemOpMap &LEAs); + /// \brief Visit over basic blocks, collect LEAs in a scoped + /// hash map (FactorizeLEAOpt::LEAs) and try to factor them out. + bool FactorizeLEAsAllBasicBlocks(MachineFunction &MF); + + bool FactorizeLEAsBasicBlock(MachineDomTreeNode *DN); + + /// \brief Factor out LEAs which share Base,Index,Offset and Segment. + bool processBasicBlock(const MachineBasicBlock &MBB); + + /// \brief Try to replace LEA with a lower strength instruction + /// to improves latency and throughput. + bool strengthReduceLEAs(MemOpMap &LEAs, const MachineBasicBlock &MBB); + DenseMap InstrPos; + FactorizeLEAOpt FactorOpt; + + MachineDominatorTree *DT; MachineRegisterInfo *MRI; const X86InstrInfo *TII; const X86RegisterInfo *TRI; @@ -489,7 +742,9 @@ bool OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) { bool Changed = false; - assert(!LEAs.empty()); + if (LEAs.empty()) + return Changed; + MachineBasicBlock *MBB = (*LEAs.begin()->second.begin())->getParent(); // Process all instructions in basic block. @@ -659,6 +914,10 @@ // Erase removed LEA from the list. I2 = List.erase(I2); + // If List becomes empty remove it from LEAs map. + if (List.empty()) + LEAs.erase(E.first); + Changed = true; } ++I1; @@ -668,6 +927,201 @@ return Changed; } +static inline int getADDrrFromLEA(int LEAOpcode) { + switch (LEAOpcode) { + default: + llvm_unreachable("Unexpected LEA instruction"); + case X86::LEA16r: + return X86::ADD16rr; + case X86::LEA32r: + return X86::ADD32rr; + case X86::LEA64_32r: + case X86::LEA64r: + return X86::ADD64rr; + } +} + +bool OptimizeLEAPass::strengthReduceLEAs(MemOpMap &LEAs, + const MachineBasicBlock &BB) { + bool Changed = false; + + // Loop over all entries in the table. + for (auto &E : LEAs) { + auto &List = E.second; + + // Loop over all LEA pairs. + auto I1 = List.begin(); + while (I1 != List.end()) { + MachineInstrBuilder NewMI; + MachineInstr &First = **I1; + MachineOperand &Res = First.getOperand(0); + MachineOperand &Base = First.getOperand(1); + MachineOperand &Scale = First.getOperand(2); + MachineOperand &Index = First.getOperand(3); + MachineOperand &Offset = First.getOperand(4); + + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(First.getOpcode())); + const DebugLoc DL = First.getDebugLoc(); + + if (!Base.isReg() || !Index.isReg() || !Index.getReg()) { + I1++; + continue; + } + + if (TargetRegisterInfo::isPhysicalRegister(Res.getReg()) || + TargetRegisterInfo::isPhysicalRegister(Base.getReg()) || + TargetRegisterInfo::isPhysicalRegister(Index.getReg())) { + I1++; + continue; + } + + // Check for register class compatibility between Result and + // Index operands. + const TargetRegisterClass *ResRC = MRI->getRegClass(Res.getReg()); + const TargetRegisterClass *IndexRC = MRI->getRegClass(Index.getReg()); + if (TRI->getRegSizeInBits(*ResRC) != TRI->getRegSizeInBits(*IndexRC)) { + I1++; + continue; + } + + MachineBasicBlock &MBB = *(const_cast(&BB)); + // R = B + I + if (Scale.isImm() && Scale.getImm() == 1 && Offset.isImm() && + !Offset.getImm()) { + NewMI = BuildMI(MBB, &First, DL, ADDrr) + .addDef(Res.getReg()) + .addUse(Base.getReg()) + .addUse(Index.getReg()); + Changed = NewMI.getInstr() != nullptr; + First.eraseFromParent(); + I1 = List.erase(I1); + + // If List becomes empty remove it from LEAs map. + if (List.empty()) + LEAs.erase(E.first); + } else + I1++; + } + } + return Changed; +} + +bool OptimizeLEAPass::processBasicBlock(const MachineBasicBlock &MBB) { + bool cseDone = false; + + // Legal scale value (1,2,4 & 8) vector. + int LegalScale[9] = {0, 1, 1, 0, 1, 0, 0, 0, 1}; + + auto CompareFn = [](const MachineInstr *Arg1, + const MachineInstr *Arg2) -> bool { + if (Arg1->getOperand(2).getImm() < Arg2->getOperand(2).getImm()) + return false; + return true; + }; + + // Loop over all entries in the table. + for (auto &E : FactorOpt.getLEAMap()) { + auto &List = E.second; + if (List.size() > 1) + List.sort(CompareFn); + + // Loop over all LEA pairs. + for (auto Iter1 = List.begin(); Iter1 != List.end(); Iter1++) { + for (auto Iter2 = std::next(Iter1); Iter2 != List.end(); Iter2++) { + MachineInstr &LI1 = **Iter1; + MachineInstr &LI2 = **Iter2; + + if (!DT->dominates(&LI2, &LI1)) + continue; + + int Scale1 = LI1.getOperand(2).getImm(); + int Scale2 = LI2.getOperand(2).getImm(); + assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg"); + DebugLoc DL = LI1.getDebugLoc(); + + // Continue if instruction has already been factorized. + if (FactorOpt.checkIfScheduledForRemoval(&LI1)) + continue; + + int Factor = Scale1 - Scale2; + if (Factor > 0 && LegalScale[Factor]) { + MachineOperand NewBase = LI2.getOperand(0); + MachineOperand NewIndex = LI1.getOperand(3); + + const TargetRegisterClass *LI2ResRC = + MRI->getRegClass(LI2.getOperand(0).getReg()); + const TargetRegisterClass *LI1BaseRC = + MRI->getRegClass(LI1.getOperand(1).getReg()); + + if (TRI->getRegSizeInBits(*LI1BaseRC) > + TRI->getRegSizeInBits(*LI2ResRC)) { + MachineInstr *LI1IndexDef = + MRI->getVRegDef(LI1.getOperand(3).getReg()); + if (LI1IndexDef->getOpcode() == TargetOpcode::SUBREG_TO_REG) { + MachineOperand &SubReg = LI1IndexDef->getOperand(2); + const TargetRegisterClass *SubRegRC = + MRI->getRegClass(SubReg.getReg()); + if (TRI->getRegSizeInBits(*SubRegRC) == + TRI->getRegSizeInBits(*LI2ResRC)) + NewIndex = SubReg; + else + continue; + } + } + + DEBUG(dbgs() << "CSE LEAs: Candidate to replace: "; LI1.dump();); + MachineInstrBuilder NewMI = + BuildMI(*(const_cast(&MBB)), &LI1, DL, + TII->get(LI1.getOpcode())) + .addDef(LI1.getOperand(0).getReg()) // Dst = Dst of LI1. + .addUse(NewBase.getReg()) // Base = Dst to LI2. + .addImm(Factor) // Scale = Diff b/w scales. + .addUse(NewIndex.getReg()) // Index = Index of LI1. + .addImm(0) // Disp = 0 + .addUse( + LI1.getOperand(5).getReg()); // Segment = Segmant of LI1. + + cseDone = NewMI.getInstr() != nullptr; + + /// To preserve the SSA nature we need to remove Def flag + /// from old result. + LI1.getOperand(0).setIsDef(false); + + /// Lazy removal shall ensure that replaced LEA remains + /// till we finish processing all the basic block. This shall + /// provide opportunity for further factorization based on + /// the replaced LEA which will be legal since it has same + /// destination as newly formed LEA. + FactorOpt.addForLazyRemoval(&LI1); + + NumFactoredLEAs++; + DEBUG(dbgs() << "CSE LEAs: Replaced by: "; NewMI->dump();); + } + } + } + } + return cseDone; +} + +bool OptimizeLEAPass::FactorizeLEAsBasicBlock(MachineDomTreeNode *DN) { + bool Changed = false; + MachineBasicBlock *MBB = DN->getBlock(); + FactorOpt.pushScope(MBB); + + Changed |= processBasicBlock(*MBB); + for (auto Child : DN->getChildren()) + FactorizeLEAsBasicBlock(Child); + + FactorOpt.popScope(); + return Changed; +} + +bool OptimizeLEAPass::FactorizeLEAsAllBasicBlocks(MachineFunction &MF) { + bool Changed = FactorizeLEAsBasicBlock(DT->getRootNode()); + FactorOpt.performCleanup(); + return Changed; +} + bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; @@ -677,6 +1131,10 @@ MRI = &MF.getRegInfo(); TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); + DT = &getAnalysis(); + + // Attempt factorizing LEAs. + Changed |= FactorizeLEAsAllBasicBlocks(MF); // Process all basic blocks. for (auto &MBB : MF) { @@ -693,6 +1151,9 @@ // Remove redundant LEA instructions. Changed |= removeRedundantLEAs(LEAs); + // Strength reduce LEA instructions. + Changed |= strengthReduceLEAs(LEAs, MBB); + // Remove redundant address calculations. Do it only for -Os/-Oz since only // a code size gain is expected from this part of the pass. if (MF.getFunction()->optForSize()) Index: test/CodeGen/X86/GlobalISel/callingconv.ll =================================================================== --- test/CodeGen/X86/GlobalISel/callingconv.ll +++ test/CodeGen/X86/GlobalISel/callingconv.ll @@ -388,7 +388,7 @@ ; X32-NEXT: movl 4(%ecx), %ecx ; X32-NEXT: movl %eax, (%esp) ; X32-NEXT: movl $4, %eax -; X32-NEXT: leal (%esp,%eax), %eax +; X32-NEXT: addl %esp, %eax ; X32-NEXT: movl %edx, 4(%esp) ; X32-NEXT: movl %ecx, 4(%eax) ; X32-NEXT: calll variadic_callee Index: test/CodeGen/X86/GlobalISel/gep.ll =================================================================== --- test/CodeGen/X86/GlobalISel/gep.ll +++ test/CodeGen/X86/GlobalISel/gep.ll @@ -5,10 +5,10 @@ define i32* @test_gep_i8(i32 *%arr, i8 %ind) { ; X64_GISEL-LABEL: test_gep_i8: ; X64_GISEL: # BB#0: -; X64_GISEL-NEXT: movq $4, %rax -; X64_GISEL-NEXT: movsbq %sil, %rcx -; X64_GISEL-NEXT: imulq %rax, %rcx -; X64_GISEL-NEXT: leaq (%rdi,%rcx), %rax +; X64_GISEL-NEXT: movq $4, %rcx +; X64_GISEL-NEXT: movsbq %sil, %rax +; X64_GISEL-NEXT: imulq %rcx, %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i8: @@ -25,7 +25,7 @@ ; X64_GISEL-LABEL: test_gep_i8_const: ; X64_GISEL: # BB#0: ; X64_GISEL-NEXT: movq $80, %rax -; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i8_const: @@ -39,10 +39,10 @@ define i32* @test_gep_i16(i32 *%arr, i16 %ind) { ; X64_GISEL-LABEL: test_gep_i16: ; X64_GISEL: # BB#0: -; X64_GISEL-NEXT: movq $4, %rax -; X64_GISEL-NEXT: movswq %si, %rcx -; X64_GISEL-NEXT: imulq %rax, %rcx -; X64_GISEL-NEXT: leaq (%rdi,%rcx), %rax +; X64_GISEL-NEXT: movq $4, %rcx +; X64_GISEL-NEXT: movswq %si, %rax +; X64_GISEL-NEXT: imulq %rcx, %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i16: @@ -59,7 +59,7 @@ ; X64_GISEL-LABEL: test_gep_i16_const: ; X64_GISEL: # BB#0: ; X64_GISEL-NEXT: movq $80, %rax -; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i16_const: @@ -73,10 +73,10 @@ define i32* @test_gep_i32(i32 *%arr, i32 %ind) { ; X64_GISEL-LABEL: test_gep_i32: ; X64_GISEL: # BB#0: -; X64_GISEL-NEXT: movq $4, %rax -; X64_GISEL-NEXT: movslq %esi, %rcx -; X64_GISEL-NEXT: imulq %rax, %rcx -; X64_GISEL-NEXT: leaq (%rdi,%rcx), %rax +; X64_GISEL-NEXT: movq $4, %rcx +; X64_GISEL-NEXT: movslq %esi, %rax +; X64_GISEL-NEXT: imulq %rcx, %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i32: @@ -92,7 +92,7 @@ ; X64_GISEL-LABEL: test_gep_i32_const: ; X64_GISEL: # BB#0: ; X64_GISEL-NEXT: movq $20, %rax -; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i32_const: @@ -108,7 +108,7 @@ ; X64_GISEL: # BB#0: ; X64_GISEL-NEXT: movq $4, %rax ; X64_GISEL-NEXT: imulq %rsi, %rax -; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i64: @@ -123,7 +123,7 @@ ; X64_GISEL-LABEL: test_gep_i64_const: ; X64_GISEL: # BB#0: ; X64_GISEL-NEXT: movq $20, %rax -; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax +; X64_GISEL-NEXT: addq %rdi, %rax ; X64_GISEL-NEXT: retq ; ; X64-LABEL: test_gep_i64_const: Index: test/CodeGen/X86/GlobalISel/memop-scalar.ll =================================================================== --- test/CodeGen/X86/GlobalISel/memop-scalar.ll +++ test/CodeGen/X86/GlobalISel/memop-scalar.ll @@ -181,7 +181,7 @@ ; ALL-LABEL: test_gep_folding_largeGepIndex: ; ALL: # BB#0: ; ALL-NEXT: movabsq $228719476720, %rax # imm = 0x3540BE3FF0 -; ALL-NEXT: leaq (%rdi,%rax), %rax +; ALL-NEXT: addq %rdi, %rax ; ALL-NEXT: movl %esi, (%rax) ; ALL-NEXT: movl (%rax), %eax ; ALL-NEXT: retq Index: test/CodeGen/X86/lea-opt-cse1.ll =================================================================== --- test/CodeGen/X86/lea-opt-cse1.ll +++ test/CodeGen/X86/lea-opt-cse1.ll @@ -9,27 +9,21 @@ ; X64: # BB#0: # %entry ; X64-NEXT: movl (%rdi), %eax ; X64-NEXT: movl 16(%rdi), %ecx -; X64-NEXT: leal (%rax,%rcx), %edx ; X64-NEXT: leal 1(%rax,%rcx), %eax ; X64-NEXT: movl %eax, 12(%rdi) -; X64-NEXT: leal 1(%rcx,%rdx), %eax +; X64-NEXT: addq %ecx, %eax ; X64-NEXT: movl %eax, 16(%rdi) ; X64-NEXT: retq ; ; X86-LABEL: test_func: ; X86: # BB#0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %ecx ; X86-NEXT: movl 16(%eax), %edx -; X86-NEXT: leal 1(%ecx,%edx), %esi +; X86-NEXT: leal 1(%ecx,%edx), %ecx +; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: leal 1(%edx,%ecx), %ecx ; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: popl %esi ; X86-NEXT: retl entry: %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0 Index: test/CodeGen/X86/lea-opt-cse2.ll =================================================================== --- test/CodeGen/X86/lea-opt-cse2.ll +++ test/CodeGen/X86/lea-opt-cse2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+slow-3ops-lea | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+slow-3ops-lea | FileCheck %s -check-prefix=X86 %struct.SA = type { i32 , i32 , i32 , i32 , i32}; @@ -10,43 +10,39 @@ ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB0_1: # %loop ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl 16(%rdi), %ecx -; X64-NEXT: leal 1(%rax,%rcx), %edx -; X64-NEXT: movl %edx, 12(%rdi) +; X64-NEXT: movl 16(%rdi), %eax +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: incl %ecx +; X64-NEXT: movl %ecx, 12(%rdi) ; X64-NEXT: decl %esi ; X64-NEXT: jne .LBB0_1 ; X64-NEXT: # BB#2: # %exit -; X64-NEXT: addl %ecx, %eax -; X64-NEXT: leal 1(%rcx,%rax), %eax -; X64-NEXT: movl %eax, 16(%rdi) +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: movl %ecx, 16(%rdi) ; X64-NEXT: retq ; ; X86-LABEL: foo: ; X86: # BB#0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %edi, -8 +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl 16(%eax), %esi -; X86-NEXT: leal 1(%edx,%esi), %edi -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl 16(%eax), %edx +; X86-NEXT: movl (%eax), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: incl %esi +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: decl %ecx ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # BB#2: # %exit -; X86-NEXT: addl %esi, %edx -; X86-NEXT: leal 1(%esi,%edx), %ecx -; X86-NEXT: movl %ecx, 16(%eax) +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movl %esi, 16(%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl entry: br label %loop Index: test/CodeGen/X86/lea-opt-cse3.ll =================================================================== --- test/CodeGen/X86/lea-opt-cse3.ll +++ test/CodeGen/X86/lea-opt-cse3.ll @@ -8,7 +8,7 @@ ; X64-NEXT: # kill: %ESI %ESI %RSI ; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal 4(%rdi,%rsi,2), %ecx -; X64-NEXT: leal 4(%rdi,%rsi,4), %eax +; X64-NEXT: leal (%ecx,%esi,2), %eax ; X64-NEXT: imull %ecx, %eax ; X64-NEXT: retq ; @@ -16,9 +16,9 @@ ; X86: # BB#0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 4(%ecx,%eax,2), %edx -; X86-NEXT: leal 4(%ecx,%eax,4), %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: leal 4(%ecx,%eax,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax +; X86-NEXT: imull %ecx, %eax ; X86-NEXT: retl entry: %mul = shl i32 %b, 1 @@ -36,7 +36,7 @@ ; X64-NEXT: # kill: %ESI %ESI %RSI ; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal 4(%rdi,%rsi,4), %ecx -; X64-NEXT: leal 4(%rdi,%rsi,8), %eax +; X64-NEXT: leal (%ecx,%esi,4), %eax ; X64-NEXT: imull %ecx, %eax ; X64-NEXT: retq ; @@ -44,9 +44,9 @@ ; X86: # BB#0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal 4(%ecx,%eax,4), %edx -; X86-NEXT: leal 4(%ecx,%eax,8), %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: leal 4(%ecx,%eax,4), %ecx +; X86-NEXT: leal (%ecx,%eax,4), %eax +; X86-NEXT: imull %ecx, %eax ; X86-NEXT: retl entry: %mul = shl i32 %b, 2 @@ -68,29 +68,23 @@ ; X64-NEXT: cmpl $10, %ecx ; X64-NEXT: je .LBB2_2 ; X64-NEXT: # BB#1: # %mid -; X64-NEXT: leal 4(%rdi,%rsi,8), %eax -; X64-NEXT: imull %eax, %ecx -; X64-NEXT: movl %ecx, %eax +; X64-NEXT: leal (%ecx,%esi,4), %eax +; X64-NEXT: imull %ecx, %eax ; X64-NEXT: .LBB2_2: # %exit ; X64-NEXT: retq ; ; X86-LABEL: foo1_mult_basic_blocks: ; X86: # BB#0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: leal 4(%esi,%edx,4), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal 4(%eax,%edx,4), %ecx ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: cmpl $10, %ecx ; X86-NEXT: je .LBB2_2 ; X86-NEXT: # BB#1: # %mid -; X86-NEXT: leal 4(%esi,%edx,8), %eax -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: leal (%ecx,%edx,4), %eax +; X86-NEXT: imull %ecx, %eax ; X86-NEXT: .LBB2_2: # %exit -; X86-NEXT: popl %esi ; X86-NEXT: retl entry: %mul = shl i32 %b, 2 Index: test/CodeGen/X86/lea-opt-cse4.ll =================================================================== --- test/CodeGen/X86/lea-opt-cse4.ll +++ test/CodeGen/X86/lea-opt-cse4.ll @@ -1,41 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+slow-3ops-lea | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+slow-3ops-lea | FileCheck %s -check-prefix=X86 %struct.SA = type { i32 , i32 , i32 , i32 , i32}; define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 { ; X64-LABEL: foo: ; X64: # BB#0: # %entry -; X64-NEXT: movl 16(%rdi), %eax -; X64-NEXT: movl (%rdi), %ecx -; X64-NEXT: addl %eax, %ecx -; X64-NEXT: addl %eax, %ecx -; X64-NEXT: addl %eax, %ecx -; X64-NEXT: leal (%rcx,%rax), %edx -; X64-NEXT: leal 1(%rax,%rcx), %ecx -; X64-NEXT: movl %ecx, 12(%rdi) -; X64-NEXT: leal 1(%rax,%rdx), %eax +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movl 16(%rdi), %ecx +; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: addl $1, %eax +; X64-NEXT: movl %eax, 12(%rdi) +; X64-NEXT: addl %ecx, %eax ; X64-NEXT: movl %eax, 16(%rdi) ; X64-NEXT: retq ; ; X86-LABEL: foo: ; X86: # BB#0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: leal 1(%ecx,%edx), %esi -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: leal 1(%ecx,%edx), %ecx +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 16(%eax), %edx +; X86-NEXT: leal (%ecx,%edx,4), %ecx +; X86-NEXT: addl $1, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: popl %esi ; X86-NEXT: retl entry: %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0 @@ -62,15 +52,15 @@ ; X64-NEXT: .p2align 4, 0x90 ; X64-NEXT: .LBB1_1: # %loop ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl (%rdi), %ecx ; X64-NEXT: movl 16(%rdi), %eax -; X64-NEXT: leal 1(%rcx,%rax), %edx -; X64-NEXT: movl %edx, 12(%rdi) +; X64-NEXT: movl (%rdi), %ecx +; X64-NEXT: addl %eax, %ecx +; X64-NEXT: incl %ecx +; X64-NEXT: movl %ecx, 12(%rdi) ; X64-NEXT: decl %esi ; X64-NEXT: jne .LBB1_1 ; X64-NEXT: # BB#2: # %exit ; X64-NEXT: addl %eax, %ecx -; X64-NEXT: leal 1(%rax,%rcx), %ecx ; X64-NEXT: addl %eax, %ecx ; X64-NEXT: addl %eax, %ecx ; X64-NEXT: addl %eax, %ecx @@ -82,26 +72,23 @@ ; ; X86-LABEL: foo_loop: ; X86: # BB#0: # %entry -; X86-NEXT: pushl %edi -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: .cfi_offset %esi, -12 -; X86-NEXT: .cfi_offset %edi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB1_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl (%eax), %esi ; X86-NEXT: movl 16(%eax), %ecx -; X86-NEXT: leal 1(%esi,%ecx), %edi -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: decl %edx +; X86-NEXT: movl (%eax), %edx +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: incl %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: decl %esi ; X86-NEXT: jne .LBB1_1 ; X86-NEXT: # BB#2: # %exit -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: leal 1(%ecx,%esi), %edx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl %ecx, %edx @@ -110,7 +97,6 @@ ; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl %edx, 16(%eax) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl entry: br label %loop Index: test/CodeGen/X86/mul-constant-i16.ll =================================================================== --- test/CodeGen/X86/mul-constant-i16.ll +++ test/CodeGen/X86/mul-constant-i16.ll @@ -558,11 +558,10 @@ define i16 @test_mul_by_29(i16 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: # kill: %AX %AX %EAX ; X86-NEXT: retl ; @@ -571,8 +570,7 @@ ; X64-NEXT: # kill: %EDI %EDI %RDI ; X64-NEXT: leal (%rdi,%rdi,8), %eax ; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: leal (%rax,%rdi,2), %eax ; X64-NEXT: # kill: %AX %AX %EAX ; X64-NEXT: retq %mul = mul nsw i16 %x, 29 Index: test/CodeGen/X86/mul-constant-i32.ll =================================================================== --- test/CodeGen/X86/mul-constant-i32.ll +++ test/CodeGen/X86/mul-constant-i32.ll @@ -1457,11 +1457,10 @@ define i32 @test_mul_by_29(i32 %x) { ; X86-LABEL: test_mul_by_29: ; X86: # BB#0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: leal (%ecx,%ecx,8), %eax -; X86-NEXT: leal (%eax,%eax,2), %eax -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,8), %ecx +; X86-NEXT: leal (%ecx,%ecx,2), %ecx +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: retl ; ; X64-HSW-LABEL: test_mul_by_29: @@ -1469,8 +1468,7 @@ ; X64-HSW-NEXT: # kill: %EDI %EDI %RDI ; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] -; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25] +; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax # sched: [1:0.50] ; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: @@ -1478,8 +1476,7 @@ ; X64-JAG-NEXT: # kill: %EDI %EDI %RDI ; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50] ; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] -; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50] +; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax # sched: [1:0.50] ; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_29: Index: test/CodeGen/X86/mul-constant-i64.ll =================================================================== --- test/CodeGen/X86/mul-constant-i64.ll +++ test/CodeGen/X86/mul-constant-i64.ll @@ -1523,8 +1523,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: leal (%ecx,%eax,2), %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ecx, %edx @@ -1534,16 +1533,14 @@ ; X64-HSW: # BB#0: ; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] -; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25] +; X64-HSW-NEXT: leaq (%rax,%rdi,2), %rax # sched: [1:0.50] ; X64-HSW-NEXT: retq # sched: [2:1.00] ; ; X64-JAG-LABEL: test_mul_by_29: ; X64-JAG: # BB#0: ; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50] ; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] -; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50] +; X64-JAG-NEXT: leaq (%rax,%rdi,2), %rax # sched: [1:0.50] ; X64-JAG-NEXT: retq # sched: [4:1.00] ; ; X86-NOOPT-LABEL: test_mul_by_29: Index: test/CodeGen/X86/mul-constant-result.ll =================================================================== --- test/CodeGen/X86/mul-constant-result.ll +++ test/CodeGen/X86/mul-constant-result.ll @@ -164,8 +164,7 @@ ; X86-NEXT: .LBB0_35: ; X86-NEXT: leal (%eax,%eax,8), %ecx ; X86-NEXT: leal (%ecx,%ecx,2), %ecx -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: leal (%ecx,%eax,2), %eax ; X86-NEXT: popl %esi ; X86-NEXT: retl ; X86-NEXT: .LBB0_36: @@ -323,16 +322,17 @@ ; X64-HSW-NEXT: .LBB0_31: ; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx ; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx -; X64-HSW-NEXT: jmp .LBB0_17 -; X64-HSW-NEXT: .LBB0_32: -; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx -; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx -; X64-HSW-NEXT: addl %eax, %ecx ; X64-HSW-NEXT: .LBB0_17: ; X64-HSW-NEXT: addl %eax, %ecx ; X64-HSW-NEXT: movl %ecx, %eax ; X64-HSW-NEXT: # kill: %EAX %EAX %RAX ; X64-HSW-NEXT: retq +; X64-HSW-NEXT: .LBB0_32: +; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx +; X64-HSW-NEXT: leal (%rcx,%rcx,2), %ecx +; X64-HSW-NEXT: leal (%rcx,%rax,2), %eax +; X64-HSW-NEXT: # kill: %EAX %EAX %RAX +; X64-HSW-NEXT: retq ; X64-HSW-NEXT: .LBB0_33: ; X64-HSW-NEXT: movl %eax, %ecx ; X64-HSW-NEXT: shll $5, %ecx Index: test/CodeGen/X86/umul-with-overflow.ll =================================================================== --- test/CodeGen/X86/umul-with-overflow.ll +++ test/CodeGen/X86/umul-with-overflow.ll @@ -40,10 +40,10 @@ ; X64-NEXT: leal (%rdi,%rdi), %eax ; X64-NEXT: retq entry: - %tmp0 = add i32 %b, %a - %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2) - %tmp2 = extractvalue { i32, i1 } %tmp1, 0 - ret i32 %tmp2 + %tmp0 = add i32 %b, %a + %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2) + %tmp2 = extractvalue { i32, i1 } %tmp1, 0 + ret i32 %tmp2 } define i32 @test3(i32 %a, i32 %b) nounwind readnone { @@ -64,8 +64,8 @@ ; X64-NEXT: mull %ecx ; X64-NEXT: retq entry: - %tmp0 = add i32 %b, %a - %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4) - %tmp2 = extractvalue { i32, i1 } %tmp1, 0 - ret i32 %tmp2 + %tmp0 = add i32 %b, %a + %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4) + %tmp2 = extractvalue { i32, i1 } %tmp1, 0 + ret i32 %tmp2 } Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll =================================================================== --- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -13,14 +13,14 @@ ; X64-NEXT: .p2align ; X64: %loop ; no complex address modes -; X64-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X64-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, ; ; X32: @simple ; no expensive address computation in the preheader ; X32-NOT: imul ; X32: %loop ; no complex address modes -; X32-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind { entry: br label %loop @@ -103,7 +103,7 @@ ; X32-NOT: mov{{.*}}(%esp){{$}} ; X32: %for.body{{$}} ; no complex address modes -; X32-NOT: (%{{[^)]+}},%{{[^)]+}}, +; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}}, ; no reloads ; X32-NOT: (%esp) define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {