Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1410,6 +1410,19 @@
       return false;
     }
 
+    if (AM.Scale == 1) {
+       if (AM.Base_Reg == N) {
+          SDValue Base_Reg = AM.Base_Reg;  
+          AM.Base_Reg = AM.IndexReg;
+          AM.IndexReg = Base_Reg;
+          AM.Scale++;
+          return false;
+       } else if (AM.IndexReg == N) {
+          AM.Scale++;
+          return false;
+       }
+    }
+
     // Otherwise, we cannot select it.
     return true;
   }
Index: lib/Target/X86/X86OptimizeLEAs.cpp
===================================================================
--- lib/Target/X86/X86OptimizeLEAs.cpp
+++ lib/Target/X86/X86OptimizeLEAs.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -65,8 +66,8 @@
 public:
   MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
            const MachineOperand *Index, const MachineOperand *Segment,
-           const MachineOperand *Disp)
-      : Disp(Disp) {
+           const MachineOperand *Disp, bool DispCheck = false)
+      : Disp(Disp),HardDispCheck(DispCheck) {
     Operands[0] = Base;
     Operands[1] = Scale;
     Operands[2] = Index;
@@ -83,7 +84,10 @@
     // matters that they use the same symbol/index/address. Immediates' or
     // offsets' differences will be taken care of during instruction
     // substitution.
-    return isSimilarDispOp(*Disp, *Other.Disp);
+    if (HardDispCheck)
+      return isIdenticalOp(*Disp,*Other.Disp);
+    else
+      return isSimilarDispOp(*Disp, *Other.Disp);
   }
 
   // Address' base, scale, index and segment operands.
@@ -91,6 +95,9 @@
 
   // Address' displacement operand.
   const MachineOperand *Disp;
+
+  // Forces absolute displacement check.
+  bool HardDispCheck;
 };
 } // end anonymous namespace
 
@@ -178,6 +185,17 @@
                   &MI.getOperand(N + X86::AddrDisp));
 }
 
+static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) {
+  static MachineOperand DummyScale = MachineOperand::CreateImm(1);
+  assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+         "The instruction must be a LEA, a load or a store");
+  return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg),
+                  &DummyScale,
+                  &MI.getOperand(N + X86::AddrIndexReg),
+                  &MI.getOperand(N + X86::AddrSegmentReg),
+                  &MI.getOperand(N + X86::AddrDisp),true);
+}
+
 static inline bool isIdenticalOp(const MachineOperand &MO1,
                                  const MachineOperand &MO2) {
   return MO1.isIdenticalTo(MO2) &&
@@ -228,6 +246,12 @@
   /// been calculated by LEA. Also, remove redundant LEAs.
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+     AU.setPreservesCFG();
+     MachineFunctionPass::getAnalysisUsage(AU);
+     AU.addRequired<MachineDominatorTree>();
+  }
+
 private:
   typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
 
@@ -261,6 +285,12 @@
   /// distance between them.
   void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
 
+  /// \brief Find all LEA instructions in the basic block that have same
+  /// Base, Index, Disp and Segment.
+  void populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs);
+
+  bool cseLEAs(const MachineBasicBlock &MBB);
+
   /// \brief Removes redundant address calculations.
   bool removeRedundantAddrCalc(MemOpMap &LEAs);
 
@@ -275,6 +305,7 @@
 
   DenseMap<const MachineInstr *, unsigned> InstrPos;
 
+  MachineDominatorTree *DT;
   MachineRegisterInfo *MRI;
   const X86InstrInfo *TII;
   const X86RegisterInfo *TRI;
@@ -286,6 +317,13 @@
 
 FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
 
+void OptimizeLEAPass::populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs) {
+  for (auto &MI : MBB) {
+    if (isLEA(MI))
+      LEAs[getMemOpCSEKey(MI, 1)].push_back(const_cast<MachineInstr *>(&MI));
+  }
+}
+
 int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
                                    const MachineInstr &Last) {
   // Both instructions must be in the same basic block and they must be
@@ -646,6 +684,60 @@
   return Changed;
 }
 
+bool OptimizeLEAPass::cseLEAs(const MachineBasicBlock &MBB) {
+   MemOpMap LEAs;
+   bool cseDone = false;
+
+   populateCSEMap(MBB,LEAs);
+
+   auto CompareFn = 
+     [] (const MachineInstr *Arg1,const MachineInstr *Arg2) -> bool {
+        if(Arg1->getOperand(2).getImm() < Arg2->getOperand(2).getImm())
+           return false;
+        return true;  
+     };
+
+   // Loop over all entries in the table.
+   for (auto &E : LEAs) {
+     auto &List = E.second;
+     if(List.size() > 1) {
+        std::sort(List.begin(),List.end(),CompareFn);
+     }
+     // Loop over all LEA pairs.
+     for(auto LII = List.begin(); LII != List.end(); LII++) {
+        MachineInstr &LI1 = **LII;
+        auto LINext = std::next(LII);
+        if(LINext == List.end()) 
+           break;
+        MachineInstr &LI2 = **LINext;
+        if (!DT->dominates(&LI2,&LI1))
+           continue;
+        
+        int Scale1 = LI1.getOperand(2).getImm(); 
+        int Scale2 = LI2.getOperand(2).getImm(); 
+        assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg");
+        DebugLoc DL = LI1.getDebugLoc();
+
+        if ((Scale1 - Scale2) > 0) {
+            MachineInstr *NewMI = 
+            BuildMI(*(const_cast<MachineBasicBlock *>(&MBB)),
+                &LI1,DL,TII->get(LI1.getOpcode()))
+              .addDef(LI1.getOperand(0).getReg())  // Dst   = Dst of LI1. 
+              .addUse(LI2.getOperand(0).getReg())  // Base  = Dst of LI2.
+              .addImm(Scale1-Scale2)               // Scale = Diff b/w scales.
+              .addUse(LI1.getOperand(3).getReg())  // Index = Index of LI1.
+              .addImm(0)                           // Disp  = 0 
+              .addUse(LI1.getOperand(5).getReg()); // Segment = Segmant of LI1.  
+
+              LI1.eraseFromParent();
+              cseDone = NewMI != nullptr;
+        }
+     }
+   } 
+   return cseDone;
+}
+
+
 bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
@@ -655,12 +747,16 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  DT  = &getAnalysis<MachineDominatorTree>();
 
   // Process all basic blocks.
   for (auto &MBB : MF) {
     MemOpMap LEAs;
     InstrPos.clear();
 
+    // Attempt CSE over LEAs.
+    Changed |= cseLEAs(MBB);
+
     // Find all LEA instructions in basic block.
     findLEAs(MBB, LEAs);
 
Index: test/CodeGen/X86/lea-opt-cst.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lea-opt-cst.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.SA = type { i32 , i32  , i32 , i32 ,i32 }
+
+define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X86-LABEL: test_func: 
+; X86:         # BB#0: 
+; X86-NEXT:    movl (%rdi), %eax
+; X86-NEXT:    movl 16(%rdi), %ecx
+; X86-NEXT:    leal 1(%rax,%rcx), %eax
+; X86-NEXT:    movl %eax, 12(%rdi)
+; X86-NEXT:    leal (%eax,%rcx), %eax
+; X86-NEXT:    movl %eax, 16(%rdi)
+; X86-NEXT:    retq
+ entry:
+   %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+   %0 = load i32, i32* %h0, align 8
+   %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+   %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+   %1 = load i32, i32* %h4, align 8
+   %add = add i32 %0, 1
+   %add4 = add i32 %add, %1
+   store i32 %add4, i32* %h3, align 4
+   %add29 = add i32 %add4 , %1
+   store i32 %add29, i32* %h4, align 8
+   ret void
+}
Index: test/CodeGen/X86/mul-constant-i16.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i16.ll
+++ test/CodeGen/X86/mul-constant-i16.ll
@@ -558,11 +558,10 @@
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
@@ -571,8 +570,7 @@
 ; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
 ; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    leal (%rax,%rdi,2), %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 29
Index: test/CodeGen/X86/mul-constant-i32.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i32.ll
+++ test/CodeGen/X86/mul-constant-i32.ll
@@ -1457,30 +1457,27 @@
 define i32 @test_mul_by_29(i32 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-HSW-LABEL: test_mul_by_29:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]   
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
 ; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    retq # sched: [4:1.00]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax #  sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax #  sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rdi,2), %eax #  sched: [1:0.50]
+; X64-JAG-NEXT:    retq                     #  sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
 ; X86-NOOPT:       # BB#0:
Index: test/CodeGen/X86/mul-constant-i64.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i64.ll
+++ test/CodeGen/X86/mul-constant-i64.ll
@@ -1523,8 +1523,7 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
@@ -1532,19 +1531,17 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_29:
 ; X64-HSW:       # BB#0:
-; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax   #  sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax   #  sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rdi,2), %rax   #  sched: [1:0.50]
+; X64-HSW-NEXT:    retq                       #  sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
-; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    retq # sched: [4:1.00]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax     #  sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax     #  sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rdi,2), %rax     #  sched: [1:0.50]
+; X64-JAG-NEXT:    retq                         #  sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
 ; X86-NOOPT:       # BB#0:
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- test/CodeGen/X86/mul-constant-result.ll
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -163,8 +163,7 @@
 ; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_36:
@@ -322,16 +321,17 @@
 ; X64-HSW-NEXT:  .LBB0_31:
 ; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
 ; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_17
-; X64-HSW-NEXT:  .LBB0_32:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:  .LBB0_17:
 ; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
 ; X64-HSW-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_32:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_33:
 ; X64-HSW-NEXT:    movl %eax, %ecx
 ; X64-HSW-NEXT:    shll $5, %ecx
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -13,14 +13,14 @@
 ; X64-NEXT: .p2align
 ; X64: %loop
 ; no complex address modes
-; X64-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X64-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}},
 ;
 ; X32: @simple
 ; no expensive address computation in the preheader
 ; X32-NOT: imul
 ; X32: %loop
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}},
 define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
 entry:
   br label %loop
@@ -103,7 +103,7 @@
 ; X32-NOT: mov{{.*}}(%esp){{$}}
 ; X32: %for.body{{$}}
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: [1-9]+(%{{[^)]+}},%{{[^)]+}},
 ; no reloads
 ; X32-NOT: (%esp)
 define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
Index: utils/TableGen/DAGISelMatcherGen.cpp
===================================================================
--- utils/TableGen/DAGISelMatcherGen.cpp
+++ utils/TableGen/DAGISelMatcherGen.cpp
@@ -305,7 +305,7 @@
   const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator());
 
   // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is
-  // a constant without a predicate fn that has more that one bit set, handle
+  // a constant without a predicate fn that has more than one bit set, handle
   // this as a special case.  This is usually for targets that have special
   // handling of certain large constants (e.g. alpha with it's 8/16/32-bit
   // handling stuff).  Using these instructions is often far more efficient