Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1410,6 +1410,19 @@
     return false;
   }
 
+  if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Scale == 1) {
+    if (AM.Base_Reg == N) {
+      SDValue Base_Reg = AM.Base_Reg;
+      AM.Base_Reg = AM.IndexReg;
+      AM.IndexReg = Base_Reg;
+      AM.Scale = 2;
+      return false;
+    } else if (AM.IndexReg == N) {
+      AM.Scale = 2;
+      return false;
+    }
+  }
+
   // Otherwise, we cannot select it.
   return true;
 }
Index: lib/Target/X86/X86OptimizeLEAs.cpp
===================================================================
--- lib/Target/X86/X86OptimizeLEAs.cpp
+++ lib/Target/X86/X86OptimizeLEAs.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -44,6 +45,7 @@
     cl::init(false));
 
 STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");
+STATISTIC(NumFactoredLEAs, "Number of LEAs factorized");
 STATISTIC(NumRedundantLEAs, "Number of redundant LEA instructions removed");
 
 /// \brief Returns true if two machine operands are identical and they are not
@@ -65,8 +67,8 @@
 public:
   MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
            const MachineOperand *Index, const MachineOperand *Segment,
-           const MachineOperand *Disp)
-      : Disp(Disp) {
+           const MachineOperand *Disp, bool DispCheck = false)
+      : Disp(Disp), HardDispCheck(DispCheck) {
     Operands[0] = Base;
     Operands[1] = Scale;
     Operands[2] = Index;
@@ -82,8 +84,10 @@
     // Addresses' displacements don't have to be exactly the same. It only
     // matters that they use the same symbol/index/address. Immediates' or
     // offsets' differences will be taken care of during instruction
-    // substitution.
-    return isSimilarDispOp(*Disp, *Other.Disp);
+    // substitution. If HardDispCheck is true then Disp must be identical.
+    if (!HardDispCheck)
+      return isSimilarDispOp(*Disp, *Other.Disp);
+    return isIdenticalOp(*Disp, *Other.Disp);
   }
 
   // Address' base, scale, index and segment operands.
@@ -91,6 +95,9 @@
 
   // Address' displacement operand.
   const MachineOperand *Disp;
+
+  // Forces absolute displacement check.
+  bool HardDispCheck;
 };
 } // end anonymous namespace
 
@@ -178,6 +185,17 @@
                    &MI.getOperand(N + X86::AddrDisp));
 }
 
+static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) {
+  static MachineOperand DummyScale = MachineOperand::CreateImm(1);
+  assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+         "The instruction must be a LEA, a load or a store");
+  return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg), &DummyScale,
+                  &MI.getOperand(N + X86::AddrIndexReg),
+                  &MI.getOperand(N + X86::AddrSegmentReg),
+                  &MI.getOperand(N + X86::AddrDisp), true);
+}
+
 static inline bool isIdenticalOp(const MachineOperand &MO1,
                                  const MachineOperand &MO2) {
   return MO1.isIdenticalTo(MO2) &&
@@ -217,8 +235,124 @@
 }
 
 namespace {
+
+class FactorizeLEAOpt {
+public:
+  using LEAListT = std::list<MachineInstr *>;
+  using LEAMapT = DenseMap<MemOpKey, LEAListT>;
+  using ValueT = DenseMap<MemOpKey, unsigned>;
+  using ScopeEntryT = std::pair<MachineBasicBlock *, ValueT>;
+  using ScopeStackT = std::vector<ScopeEntryT>;
+
+  FactorizeLEAOpt() = default;
+  FactorizeLEAOpt(const FactorizeLEAOpt &) = delete;
+  FactorizeLEAOpt &operator=(const FactorizeLEAOpt &) = delete;
+
+  void performCleanup() {
+    for (auto LEA : removedLEAs)
+      LEA->eraseFromParent();
+    removedLEAs.clear();
+    LEAs.clear();
+    Stack.clear();
+  }
+
+  LEAMapT &getLEAMap() { return LEAs; }
+  ScopeEntryT *getTopScope() { return &Stack.back(); }
+
+  void addForLazyRemoval(MachineInstr *Instr) { removedLEAs.push_back(Instr); }
+
+  /// Pushes a ScopeEntry for the given basic block onto the Stack. Also
+  /// traverses the block's instructions and updates the LEAs map and the
+  /// ScopeEntry for every LEA instruction found, using insertLEA().
+  void pushScope(MachineBasicBlock *MBB);
+
+  /// Pops the ScopeEntry of the topmost basic block from the stack and
+  /// removes the LEA instructions contained in that scope from the LEAs map.
+  void popScope();
+
+  void insertLEA(MachineInstr *MI);
+
+  /// An LEA that uses physical registers is not a candidate for
+  /// factorization, since physical registers may violate the SSA
+  /// semantics of MI.
+  bool containsPhyReg(MachineInstr &MI);
+
+private:
+  ScopeStackT Stack;
+  LEAMapT LEAs;
+  std::vector<MachineInstr *> removedLEAs;
+};
+
+void FactorizeLEAOpt::pushScope(MachineBasicBlock *MBB) {
+  ValueT EmptyMap;
+  ScopeEntryT SE = std::make_pair(MBB, EmptyMap);
+  Stack.push_back(SE);
+  for (auto &MI : *MBB) {
+    if (isLEA(MI))
+      insertLEA(&MI);
+  }
+}
+
+void FactorizeLEAOpt::popScope() {
+  ScopeEntryT &SE = Stack.back();
+  for (auto MapEntry : SE.second) {
+    LEAMapT::iterator Itr = LEAs.find(MapEntry.first);
+    assert((Itr != LEAs.end()) &&
+           "LEAs map must have a node corresponding to ScopeEntry's Key.");
+
+    while ((*Itr).second.size() > MapEntry.second)
+      (*Itr).second.pop_front();
+    // If the list becomes empty, remove the entry from the LEAs map.
+    if ((*Itr).second.empty())
+      LEAs.erase(Itr);
+  }
+  Stack.pop_back();
+}
+
+bool FactorizeLEAOpt::containsPhyReg(MachineInstr &MI) {
+  MachineOperand Res = MI.getOperand(0);
+  MachineOperand Base = MI.getOperand(1);
+  MachineOperand Index = MI.getOperand(3);
+
+  if (Res.isReg() && TargetRegisterInfo::isPhysicalRegister(Res.getReg()))
+    return true;
+  if (Base.isReg() && TargetRegisterInfo::isPhysicalRegister(Base.getReg()))
+    return true;
+  if (Index.isReg() && TargetRegisterInfo::isPhysicalRegister(Index.getReg()))
+    return true;
+
+  return false;
+}
+
+void FactorizeLEAOpt::insertLEA(MachineInstr *MI) {
+  unsigned lsize = 0;
+  if (containsPhyReg(*MI))
+    return;
+
+  MemOpKey Key = getMemOpCSEKey(*MI, 1);
+  ScopeEntryT *TopScope = getTopScope();
+
+  LEAMapT::iterator Itr = LEAs.find(Key);
+  if (Itr == LEAs.end()) {
+    lsize = 0;
+    LEAs[Key].push_front(MI);
+  } else {
+    lsize = (*Itr).second.size();
+    (*Itr).second.push_front(MI);
+  }
+  if (TopScope->second.find(Key) == TopScope->second.end())
+    TopScope->second[Key] = lsize;
+}
+
 class OptimizeLEAPass : public MachineFunctionPass {
 public:
   OptimizeLEAPass() : MachineFunctionPass(ID) {}
 
   StringRef getPassName() const override { return "X86 LEA Optimize"; }
@@ -228,6 +362,13 @@
   /// been calculated by LEA. Also, remove redundant LEAs.
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineDominatorTree>();
+  }
+
 private:
   typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
 
@@ -273,8 +414,20 @@
   /// \brief Removes LEAs which calculate similar addresses.
   bool removeRedundantLEAs(MemOpMap &LEAs);
 
+  /// \brief Visits basic blocks, collects LEAs in a scoped hash map
+  /// (FactorizeLEAOpt::LEAs) and tries to factor them out.
+  bool FactorizeLEAsAllBasicBlocks(MachineFunction &MF);
+
+  bool FactorizeLEAsBasicBlock(MachineDomTreeNode *DN);
+
+  /// \brief Factors out LEAs which share the same Base, Index, Offset and
+  /// Segment.
+  bool processBasicBlock(const MachineBasicBlock &MBB);
+
   DenseMap<const MachineInstr *, unsigned> InstrPos;
 
+  FactorizeLEAOpt FactorOpt;
+
+  MachineDominatorTree *DT;
   MachineRegisterInfo *MRI;
   const X86InstrInfo *TII;
   const X86RegisterInfo *TRI;
@@ -646,6 +799,89 @@
   return Changed;
 }
 
+bool OptimizeLEAPass::processBasicBlock(const MachineBasicBlock &MBB) {
+  bool cseDone = false;
+
+  // Lookup table of legal scale values (1, 2, 4 and 8), indexed by the
+  // difference between two scales.
+  int LegalScale[9] = {0, 1, 1, 0, 1, 0, 0, 0, 1};
+
+  // Sort LEAs in decreasing order of scale. The comparator must define a
+  // strict weak ordering, so equal scales compare as "not less".
+  auto CompareFn = [](const MachineInstr *Arg1,
+                      const MachineInstr *Arg2) -> bool {
+    return Arg1->getOperand(2).getImm() > Arg2->getOperand(2).getImm();
+  };
+
+  // Loop over all entries in the table.
+  for (auto &E : FactorOpt.getLEAMap()) {
+    auto &List = E.second;
+    if (List.size() > 1)
+      List.sort(CompareFn);
+
+    // Loop over all LEA pairs.
+    for (auto Iter1 = List.begin(); Iter1 != List.end(); Iter1++) {
+      for (auto Iter2 = std::next(Iter1); Iter2 != List.end(); Iter2++) {
+        MachineInstr &LI1 = **Iter1;
+        MachineInstr &LI2 = **Iter2;
+
+        if (!DT->dominates(&LI2, &LI1))
+          continue;
+
+        int Scale1 = LI1.getOperand(2).getImm();
+        int Scale2 = LI2.getOperand(2).getImm();
+        assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg");
+        DebugLoc DL = LI1.getDebugLoc();
+
+        int Factor = Scale1 - Scale2;
+        if (Factor > 0 && LegalScale[Factor]) {
+          DEBUG(dbgs() << "CSE LEAs: Candidate to replace: "; LI1.dump(););
+          MachineInstr *NewMI =
+              BuildMI(*(const_cast<MachineBasicBlock *>(&MBB)), &LI1, DL,
+                      TII->get(LI1.getOpcode()))
+                  .addDef(LI1.getOperand(0).getReg())  // Dst     = Dst of LI1.
+                  .addUse(LI2.getOperand(0).getReg())  // Base    = Dst of LI2.
+                  .addImm(Factor)                      // Scale   = Difference of scales.
+                  .addUse(LI1.getOperand(3).getReg())  // Index   = Index of LI1.
+                  .addImm(0)                           // Disp    = 0.
+                  .addUse(LI1.getOperand(5).getReg()); // Segment = Segment of LI1.
+
+          cseDone = NewMI != nullptr;
+
+          // Lazy removal ensures that the replaced LEA remains available
+          // until we finish processing all the basic blocks. This provides
+          // an opportunity for further factorization based on the replaced
+          // LEA, which stays legal since it has the same destination as the
+          // newly formed LEA.
+          FactorOpt.addForLazyRemoval(&LI1);
+
+          NumFactoredLEAs++;
+          DEBUG(dbgs() << "CSE LEAs: Replaced by: "; NewMI->dump(););
+        }
+      }
+    }
+  }
+  return cseDone;
+}
+
+bool OptimizeLEAPass::FactorizeLEAsBasicBlock(MachineDomTreeNode *DN) {
+  bool Changed = false;
+  MachineBasicBlock *MBB = DN->getBlock();
+  FactorOpt.pushScope(MBB);
+
+  Changed |= processBasicBlock(*MBB);
+  for (auto Child : DN->getChildren())
+    Changed |= FactorizeLEAsBasicBlock(Child);
+
+  FactorOpt.popScope();
+  return Changed;
+}
+
+bool OptimizeLEAPass::FactorizeLEAsAllBasicBlocks(MachineFunction &MF) {
+  bool Changed = FactorizeLEAsBasicBlock(DT->getRootNode());
+  FactorOpt.performCleanup();
+  return Changed;
+}
+
 bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
@@ -655,6 +891,10 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+
+  // Attempt to factorize LEAs.
+  Changed |= FactorizeLEAsAllBasicBlocks(MF);
 
   // Process all basic blocks.
   for (auto &MBB : MF) {
Index: test/CodeGen/X86/lea-opt-csebb.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lea-opt-csebb.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X64-LABEL: foo:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB0_1: # %loop
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    movl (%rdi), %ecx
+; X64-NEXT:    movl 16(%rdi), %eax
+; X64-NEXT:    leal 1(%rcx,%rax), %ecx
+; X64-NEXT:    movl %ecx, 12(%rdi)
+; X64-NEXT:    decl %esi
+; X64-NEXT:    jne .LBB0_1
+; X64-NEXT:    # BB#2: # %exit
+; X64-NEXT:    leal (%ecx,%rax), %eax
+; X64-NEXT:    movl %eax, 16(%rdi)
+; X64-NEXT:    retq
+;
+; X86-LABEL: foo:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %esi
+; X86-NEXT:  .Lcfi0:
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:  .Lcfi1:
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB0_1: # %loop
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    movl (%eax), %esi
+; X86-NEXT:    movl 16(%eax), %edx
+; X86-NEXT:    leal 1(%esi,%edx), %esi
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    decl %ecx
+; X86-NEXT:    jne .LBB0_1
+; X86-NEXT:    # BB#2: # %exit
+; X86-NEXT:    leal (%esi,%edx), %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+entry:
+  br label %loop
+
+loop:
+  %iter = phi i32 [ %n, %entry ], [ %iter.ctr, %loop ]
+  %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+  %0 = load i32, i32* %h0, align 8
+  %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+  %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+  %1 = load i32, i32* %h4, align 8
+  %add = add i32 %0, 1
+  %add4 = add i32 %add, %1
+  store i32 %add4, i32* %h3, align 4
+  %add29 = add i32 %add4, %1
+  %iter.ctr = sub i32 %iter, 1
+  %res = icmp ne i32 %iter.ctr, 0
+  br i1 %res, label %loop, label %exit
+
+exit:
+  store i32 %add29, i32* %h4, align 8
+  ret void
+}
Index: test/CodeGen/X86/lea-opt-cst.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lea-opt-cst.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X64-LABEL: test_func:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl 16(%rdi), %ecx
+; X64-NEXT:    leal 1(%rax,%rcx), %eax
+; X64-NEXT:    movl %eax, 12(%rdi)
+; X64-NEXT:    leal (%eax,%rcx), %eax
+; X64-NEXT:    movl %eax, 16(%rdi)
+; X64-NEXT:    retq
+;
+; X86-LABEL: test_func:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %ecx
+; X86-NEXT:    movl 16(%eax), %edx
+; X86-NEXT:    leal 1(%ecx,%edx), %ecx
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal (%ecx,%edx), %ecx
+; X86-NEXT:    movl %ecx, 16(%eax)
+; X86-NEXT:    retl
+entry:
+  %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+  %0 = load i32, i32* %h0, align 8
+  %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+  %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+  %1 = load i32, i32* %h4, align 8
+  %add = add i32 %0, 1
+  %add4 = add i32 %add, %1
+  store i32 %add4, i32* %h3, align 4
+  %add29 = add i32 %add4, %1
+  store i32 %add29, i32* %h4, align 8
+  ret void
+}
Index: test/CodeGen/X86/mul-constant-i16.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i16.ll
+++ test/CodeGen/X86/mul-constant-i16.ll
@@ -558,11 +558,10 @@
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
@@ -571,8 +570,7 @@
 ; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
 ; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    leal (%rax,%rdi,2), %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 29
Index: test/CodeGen/X86/mul-constant-i32.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i32.ll
+++ test/CodeGen/X86/mul-constant-i32.ll
@@ -1457,11 +1457,10 @@
 define i32 @test_mul_by_29(i32 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-HSW-LABEL: test_mul_by_29:
@@ -1469,8 +1468,7 @@
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
@@ -1478,8 +1476,7 @@
 ; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
Index: test/CodeGen/X86/mul-constant-i64.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i64.ll
+++ test/CodeGen/X86/mul-constant-i64.ll
@@ -1523,8 +1523,7 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
@@ -1534,16 +1533,14 @@
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    leaq (%rax,%rdi,2), %rax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
 ; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
 ; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rdi,2), %rax # sched: [1:0.50]
 ; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- test/CodeGen/X86/mul-constant-result.ll
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -163,8 +163,7 @@
 ; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_36:
@@ -322,16 +321,17 @@
 ; X64-HSW-NEXT:  .LBB0_31:
 ; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
 ; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_17
-; X64-HSW-NEXT:  .LBB0_32:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:  .LBB0_17:
 ; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
 ; X64-HSW-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_32:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_33:
 ; X64-HSW-NEXT:    movl %eax, %ecx
 ; X64-HSW-NEXT:    shll $5, %ecx
Index: test/CodeGen/X86/umul-with-overflow.ll
===================================================================
--- test/CodeGen/X86/umul-with-overflow.ll
+++ test/CodeGen/X86/umul-with-overflow.ll
@@ -1,37 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
 
 declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
 
 define zeroext i1 @a(i32 %x) nounwind {
+; CHECK-LABEL: a:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $3, %ecx
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    seto %al
+; CHECK-NEXT:    retl
   %res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3)
   %obil = extractvalue {i32, i1} %res, 1
   ret i1 %obil
-
-; CHECK-LABEL: a:
-; CHECK: mull
-; CHECK: seto %al
-; CHECK: ret
+
 }
 
 define i32 @test2(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    retl
 entry:
   %tmp0 = add i32 %b, %a
   %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2)
   %tmp2 = extractvalue { i32, i1 } %tmp1, 0
   ret i32 %tmp2
-; CHECK-LABEL: test2:
-; CHECK: addl
-; CHECK-NEXT: addl
-; CHECK-NEXT: ret
 }
 
 define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl $4, %ecx
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    retl
 entry:
   %tmp0 = add i32 %b, %a
   %tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4)
   %tmp2 = extractvalue { i32, i1 } %tmp1, 0
   ret i32 %tmp2
-; CHECK-LABEL: test3:
-; CHECK: addl
-; CHECK: mull
-; CHECK-NEXT: ret
 }
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -13,14 +13,14 @@
 ; X64-NEXT: .p2align
 ; X64: %loop
 ; no complex address modes
-; X64-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X64-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 ;
 ; X32: @simple
 ; no expensive address computation in the preheader
 ; X32-NOT: imul
 ; X32: %loop
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
 entry:
   br label %loop
@@ -103,7 +103,7 @@
 ; X32-NOT: mov{{.*}}(%esp){{$}}
 ; X32: %for.body{{$}}
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 ; no reloads
 ; X32-NOT: (%esp)
 define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
Index: utils/TableGen/DAGISelMatcherGen.cpp
===================================================================
--- utils/TableGen/DAGISelMatcherGen.cpp
+++ utils/TableGen/DAGISelMatcherGen.cpp
@@ -305,7 +305,7 @@
   const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator());
 
   // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is
-  // a constant without a predicate fn that has more that one bit set, handle
+  // a constant without a predicate fn that has more than one bit set, handle
   // this as a special case. This is usually for targets that have special
   // handling of certain large constants (e.g. alpha with it's 8/16/32-bit
   // handling stuff). Using these instructions is often far more efficient
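
A minimal sketch of the rewrite processBasicBlock performs, assuming two LEAs
in the same dominator scope that share base, index, segment and displacement
and whose scale difference is a legal scale (1, 2, 4 or 8); the virtual
register names and constants below are illustrative only, not taken from the
patch:

  ; Before factorization: both LEAs recompute the full address.
  %vreg1 = LEA64r %vreg0, 2, %vreg9, 8, %noreg   ; %vreg1 = base + 2*index + 8
  %vreg2 = LEA64r %vreg0, 4, %vreg9, 8, %noreg   ; %vreg2 = base + 4*index + 8

  ; After factorization: the dominated LEA is rebuilt on top of the dominating
  ; one, with Scale = 4 - 2 = 2 and Disp = 0; the old LEA is queued for lazy
  ; removal so it can still seed further factorizations in the same scope.
  %vreg2 = LEA64r %vreg1, 2, %vreg9, 0, %noreg   ; %vreg2 = %vreg1 + 2*index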