Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2827,6 +2827,9 @@
     // TargetLowering::LowerCall that perform tail call conversions.
     bool IsTailCall = false;
 
+    // True if the call is being lowered after SelectionDAG type legalization.
+    bool IsPostTypeLegalization = false;
+
     unsigned NumFixedArgs = -1;
     CallingConv::ID CallConv = CallingConv::C;
     SDValue Callee;
@@ -2838,7 +2841,7 @@
     SmallVector<SDValue, 32> OutVals;
     SmallVector<ISD::InputArg, 32> Ins;
     SmallVector<SDValue, 4> InVals;
-
+
     CallLoweringInfo(SelectionDAG &DAG)
         : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
           DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
@@ -2949,6 +2952,11 @@
       return *this;
     }
 
+    CallLoweringInfo &setIsPostTypeLegalization(bool Value = true) {
+      IsPostTypeLegalization = Value;
+      return *this;
+    }
+
     ArgListTy &getArgs() {
       return Args;
     }
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1991,7 +1991,8 @@
                     std::move(Args))
       .setTailCall(isTailCall)
       .setSExtResult(isSigned)
-      .setZExtResult(!isSigned);
+      .setZExtResult(!isSigned)
+      .setIsPostTypeLegalization(true);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -2029,7 +2030,8 @@
       .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
                     std::move(Args))
       .setSExtResult(isSigned)
-      .setZExtResult(!isSigned);
+      .setZExtResult(!isSigned)
+      .setIsPostTypeLegalization(true);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
@@ -3565,16 +3567,11 @@
     SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
     Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
   }
-  BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
-                           DAG.getIntPtrConstant(0, dl));
-  TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
-                        DAG.getIntPtrConstant(1, dl));
-  // Ret is a node with an illegal type. Because such things are not
-  // generally permitted during this phase of legalization, make sure the
-  // node has no more uses. The above EXTRACT_ELEMENT nodes should have been
-  // folded.
-  assert(Ret->use_empty() &&
-         "Unexpected uses of illegally type from expanded lib call.");
+  assert((Ret.getOpcode() == ISD::BUILD_PAIR ||
+          Ret.getOpcode() == ISD::MERGE_VALUES) &&
+         "Ret value is a collection of constituent nodes holding result.");
+  BottomHalf = Ret.getOperand(0);
+  TopHalf = Ret.getOperand(1);
 }
 
 if (isSigned) {
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8085,6 +8085,9 @@
   SmallVector<SDValue, 4> ReturnValues;
   if (!CanLowerReturn) {
+    unsigned NumValues;
+    SmallVector<SDValue, 4> Chains;
+
     // The instruction result is the result of loading from the
     // hidden sret parameter.
     SmallVector<EVT, 1> PVTs;
@@ -8092,11 +8095,37 @@
     ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
     assert(PVTs.size() == 1 && "Pointers should fit in one register");
 
-    EVT PtrVT = PVTs[0];
+    EVT PartPtrVT = PVTs[0];
 
-    unsigned NumValues = RetTys.size();
+    // During node legalization, the expansion of a larger integer result
+    // into its legal constituent parts is done as part of call lowering,
+    // whereas during type legalization it is handled after call lowering.
+    if (!CLI.IsPostTypeLegalization) {
+      NumValues = RetTys.size();
+    } else {
+      LLVMContext &LC = OrigRetTy->getContext();
+      EVT OrigRetVT = getValueType(DL, OrigRetTy);
+      MVT PartVT = getRegisterType(LC, OrigRetVT);
+      PartPtrVT = getValueType(DL,
+          PointerType::getUnqual(EVT(PartVT).getTypeForEVT(LC)));
+
+      NumValues = getNumRegisters(LC, OrigRetVT);
+      unsigned ByteOffset = (PartVT.getSizeInBits() + 7) / 8;
+      Offsets.resize(NumValues);
+      for (unsigned I = 0; I != NumValues; ++I)
+        Offsets[I] = I * ByteOffset;
+
+      RetTys.assign(NumValues, EVT(PartVT));
+    }
     ReturnValues.resize(NumValues);
-    SmallVector<SDValue, 4> Chains(NumValues);
+    Chains.resize(NumValues);
 
     // An aggregate return value cannot wrap around the address space, so
     // offsets to its parts don't wrap either.
@@ -8104,9 +8133,9 @@
     SDNodeFlags Flags;
     Flags.setNoUnsignedWrap(true);
 
     for (unsigned i = 0; i < NumValues; ++i) {
-      SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
+      SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PartPtrVT, DemoteStackSlot,
                                     CLI.DAG.getConstant(Offsets[i], CLI.DL,
-                                                        PtrVT), Flags);
+                                                        PartPtrVT), Flags);
       SDValue L = CLI.DAG.getLoad(
           RetTys[i], CLI.DL, CLI.Chain, Add,
           MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1410,6 +1410,19 @@
     return false;
   }
 
+  if (AM.Scale == 1) {
+    if (AM.Base_Reg == N) {
+      SDValue Base_Reg = AM.Base_Reg;
+      AM.Base_Reg = AM.IndexReg;
+      AM.IndexReg = Base_Reg;
+      AM.Scale++;
+      return false;
+    } else if (AM.IndexReg == N) {
+      AM.Scale++;
+      return false;
+    }
+  }
+
   // Otherwise, we cannot select it.
   return true;
 }
Index: lib/Target/X86/X86OptimizeLEAs.cpp
===================================================================
--- lib/Target/X86/X86OptimizeLEAs.cpp
+++ lib/Target/X86/X86OptimizeLEAs.cpp
@@ -26,6 +26,7 @@
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -65,8 +66,8 @@
 public:
   MemOpKey(const MachineOperand *Base, const MachineOperand *Scale,
            const MachineOperand *Index, const MachineOperand *Segment,
-           const MachineOperand *Disp)
-      : Disp(Disp) {
+           const MachineOperand *Disp, bool DispCheck = false)
+      : Disp(Disp), HardDispCheck(DispCheck) {
     Operands[0] = Base;
     Operands[1] = Scale;
     Operands[2] = Index;
@@ -83,7 +84,10 @@
     // matters that they use the same symbol/index/address. Immediates' or
     // offsets' differences will be taken care of during instruction
     // substitution.
-    return isSimilarDispOp(*Disp, *Other.Disp);
+    if (HardDispCheck)
+      return isIdenticalOp(*Disp, *Other.Disp);
+    else
+      return isSimilarDispOp(*Disp, *Other.Disp);
   }
 
   // Address' base, scale, index and segment operands.
@@ -91,6 +95,9 @@
 
   // Address' displacement operand.
   const MachineOperand *Disp;
+
+  // If true, displacements must be identical rather than merely similar.
+  bool HardDispCheck;
 };
 
 } // end anonymous namespace
@@ -178,6 +185,18 @@
           &MI.getOperand(N + X86::AddrDisp));
 }
 
+static inline MemOpKey getMemOpCSEKey(const MachineInstr &MI, unsigned N) {
+  static MachineOperand DummyScale = MachineOperand::CreateImm(1);
+  assert((isLEA(MI) || MI.mayLoadOrStore()) &&
+         "The instruction must be a LEA, a load or a store");
+  // Use a fixed scale of 1 so that LEAs which differ only in scale
+  // fall into the same bucket.
+  return MemOpKey(&MI.getOperand(N + X86::AddrBaseReg), &DummyScale,
+                  &MI.getOperand(N + X86::AddrIndexReg),
+                  &MI.getOperand(N + X86::AddrSegmentReg),
+                  &MI.getOperand(N + X86::AddrDisp), /*DispCheck=*/true);
+}
+
 static inline bool isIdenticalOp(const MachineOperand &MO1,
                                  const MachineOperand &MO2) {
   return MO1.isIdenticalTo(MO2) &&
@@ -228,6 +247,12 @@
   /// been calculated by LEA. Also, remove redundant LEAs.
   bool runOnMachineFunction(MachineFunction &MF) override;
 
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
 private:
   typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
@@ -261,6 +286,13 @@
   /// distance between them.
   void findLEAs(const MachineBasicBlock &MBB, MemOpMap &LEAs);
 
+  /// \brief Find all LEA instructions in the basic block that have the same
+  /// Base, Index, Disp and Segment.
+  void populateCSEMap(const MachineBasicBlock &MBB, MemOpMap &LEAs);
+
+  /// \brief Factor LEAs that differ only in scale through a dominating LEA.
+  bool cseLEAs(const MachineBasicBlock &MBB);
+
   /// \brief Removes redundant address calculations.
   bool removeRedundantAddrCalc(MemOpMap &LEAs);
@@ -276,6 +308,7 @@
   DenseMap<const MachineInstr *, unsigned> InstrPos;
 
   MachineRegisterInfo *MRI;
+  MachineDominatorTree *DT;
   const X86InstrInfo *TII;
   const X86RegisterInfo *TRI;
@@ -462,6 +495,14 @@
   }
 }
 
+void OptimizeLEAPass::populateCSEMap(const MachineBasicBlock &MBB,
+                                     MemOpMap &LEAs) {
+  for (auto &MI : MBB) {
+    if (isLEA(MI))
+      LEAs[getMemOpCSEKey(MI, 1)].push_back(const_cast<MachineInstr *>(&MI));
+  }
+}
+
 // Try to find load and store instructions which recalculate addresses already
 // calculated by some LEA and replace their memory operands with its def
 // register.
@@ -646,6 +687,59 @@
   return Changed;
 }
 
+bool OptimizeLEAPass::cseLEAs(const MachineBasicBlock &MBB) {
+  MemOpMap LEAs;
+  bool cseDone = false;
+
+  populateCSEMap(MBB, LEAs);
+
+  // Sort each bucket by decreasing scale. Note the strict comparison so
+  // that std::sort gets a valid strict weak ordering.
+  auto CompareFn = [](const MachineInstr *Arg1,
+                      const MachineInstr *Arg2) -> bool {
+    return Arg1->getOperand(2).getImm() > Arg2->getOperand(2).getImm();
+  };
+
+  // Loop over all entries in the table.
+  for (auto &E : LEAs) {
+    auto &List = E.second;
+    if (List.size() > 1)
+      std::sort(List.begin(), List.end(), CompareFn);
+
+    // Loop over all adjacent LEA pairs.
+    for (auto LII = List.begin(); LII != List.end(); ++LII) {
+      MachineInstr &LI1 = **LII;
+      auto LINext = std::next(LII);
+      if (LINext == List.end())
+        break;
+      MachineInstr &LI2 = **LINext;
+      if (!DT->dominates(&LI2, &LI1))
+        continue;
+
+      int Scale1 = LI1.getOperand(2).getImm();
+      int Scale2 = LI2.getOperand(2).getImm();
+      assert(LI2.getOperand(0).isReg() && "Result is a VirtualReg");
+      DebugLoc DL = LI1.getDebugLoc();
+
+      if ((Scale1 - Scale2) > 0) {
+        MachineInstr *NewMI =
+            BuildMI(*(const_cast<MachineBasicBlock *>(&MBB)), &LI1, DL,
+                    TII->get(LI1.getOpcode()))
+                .addDef(LI1.getOperand(0).getReg())  // Dst     = Dst of LI1.
+                .addUse(LI2.getOperand(0).getReg())  // Base    = Dst of LI2.
+                .addImm(Scale1 - Scale2)             // Scale   = Scale diff.
+                .addUse(LI1.getOperand(3).getReg())  // Index   = Index of LI1.
+                .addImm(0)                           // Disp    = 0.
+                .addUse(LI1.getOperand(5).getReg()); // Segment = Segment of LI1.
+
+        LI1.eraseFromParent();
+        cseDone = NewMI != nullptr;
+      }
+    }
+  }
+  return cseDone;
+}
+
 bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
@@ -655,12 +749,16 @@
   MRI = &MF.getRegInfo();
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
 
   // Process all basic blocks.
   for (auto &MBB : MF) {
     MemOpMap LEAs;
     InstrPos.clear();
 
+    // Attempt CSE over LEAs.
+    Changed |= cseLEAs(MBB);
+
     // Find all LEA instructions in basic block.
     findLEAs(MBB, LEAs);
 
Index: test/CodeGen/WebAssembly/umulo-i64.ll
===================================================================
--- /dev/null
+++ test/CodeGen/WebAssembly/umulo-i64.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; Test that UMULO works correctly on 64-bit operands.
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-emscripten"
+
+; CHECK-LABEL: _ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE:
+; CHECK: __multi3
+; Function Attrs: inlinehint
+define void @"_ZN4core3num21_$LT$impl$u20$u64$GT$15overflowing_mul17h07be88b4cbac028fE"(i64, i64) unnamed_addr #0 {
+start:
+  %2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %0, i64 %1)
+  %3 = extractvalue { i64, i1 } %2, 0
+  store i64 %3, i64* undef
+  unreachable
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) #1
+
+attributes #0 = { inlinehint }
+attributes #1 = { nounwind readnone speculatable }
Index: test/CodeGen/X86/lea-opt-cse.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lea-opt-cse.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X86-LABEL: test_func:
+; X86:       # BB#0:
+; X86-NEXT:    movl (%rdi), %eax
+; X86-NEXT:    movl 16(%rdi), %ecx
+; X86-NEXT:    leal 1(%rax,%rcx), %eax
+; X86-NEXT:    movl %eax, 12(%rdi)
+; X86-NEXT:    leal (%rax,%rcx), %eax
+; X86-NEXT:    movl %eax, 16(%rdi)
+; X86-NEXT:    retq
+entry:
+  %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+  %0 = load i32, i32* %h0, align 8
+  %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+  %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+  %1 = load i32, i32* %h4, align 8
+  %add = add i32 %0, 1
+  %add4 = add i32 %add, %1
+  store i32 %add4, i32* %h3, align 4
+  %add29 = add i32 %add4, %1
+  store i32 %add29, i32* %h4, align 8
+  ret void
+}
Index: test/CodeGen/X86/mul-constant-i16.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i16.ll
+++ test/CodeGen/X86/mul-constant-i16.ll
@@ -558,11 +558,10 @@
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    # kill: %AX %AX %EAX
 ; X86-NEXT:    retl
 ;
@@ -571,8 +570,7 @@
 ; X64-NEXT:    # kill: %EDI %EDI %RDI
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
 ; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    leal (%rax,%rdi,2), %eax
 ; X64-NEXT:    # kill: %AX %AX %EAX
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 29
Index: test/CodeGen/X86/mul-constant-i32.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i32.ll
+++ test/CodeGen/X86/mul-constant-i32.ll
@@ -1457,30 +1457,27 @@
 define i32 @test_mul_by_29(i32 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    leal (%ecx,%ecx,8), %eax
-; X86-NEXT:    leal (%eax,%eax,2), %eax
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
 ; X64-HSW-LABEL: test_mul_by_29:
 ; X64-HSW:       # BB#0:
 ; X64-HSW-NEXT:    # kill: %EDI %EDI %RDI
-; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
 ; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
 ; X64-JAG-NEXT:    # kill: %EDI %EDI %RDI
-; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
-; X64-JAG-NEXT:    retq # sched: [4:1.00]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
 ; X86-NOOPT:       # BB#0:
Index: test/CodeGen/X86/mul-constant-i64.ll
===================================================================
--- test/CodeGen/X86/mul-constant-i64.ll
+++ test/CodeGen/X86/mul-constant-i64.ll
@@ -1523,8 +1523,7 @@
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    leal (%ecx,%eax,2), %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ecx, %edx
@@ -1532,19 +1531,17 @@
 ;
 ; X64-HSW-LABEL: test_mul_by_29:
 ; X64-HSW:       # BB#0:
-; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT:    retq # sched: [1:1.00]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
 ;
 ; X64-JAG-LABEL: test_mul_by_29:
 ; X64-JAG:       # BB#0:
-; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
-; X64-JAG-NEXT:    retq # sched: [4:1.00]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
 ;
 ; X86-NOOPT-LABEL: test_mul_by_29:
 ; X86-NOOPT:       # BB#0:
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- test/CodeGen/X86/mul-constant-result.ll
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -163,8 +163,7 @@
 ; X86-NEXT:  .LBB0_35:
 ; X86-NEXT:    leal (%eax,%eax,8), %ecx
 ; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    leal (%ecx,%eax,2), %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ; X86-NEXT:  .LBB0_36:
@@ -322,16 +321,17 @@
 ; X64-HSW-NEXT:  .LBB0_31:
 ; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
 ; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    jmp .LBB0_17
-; X64-HSW-NEXT:  .LBB0_32:
-; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
-; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
-; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:  .LBB0_17:
 ; X64-HSW-NEXT:    addl %eax, %ecx
 ; X64-HSW-NEXT:    movl %ecx, %eax
 ; X64-HSW-NEXT:    # kill: %EAX %EAX %RAX
 ; X64-HSW-NEXT:    retq
+; X64-HSW-NEXT:  .LBB0_32:
+; X64-HSW-NEXT:    leal (%rax,%rax,8), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rcx,2), %ecx
+; X64-HSW-NEXT:    leal (%rcx,%rax,2), %eax
+; X64-HSW-NEXT:    # kill: %EAX %EAX %RAX
+; X64-HSW-NEXT:    retq
 ; X64-HSW-NEXT:  .LBB0_33:
 ; X64-HSW-NEXT:    movl %eax, %ecx
 ; X64-HSW-NEXT:    shll $5, %ecx
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -13,14 +13,14 @@
 ; X64-NEXT: .p2align
 ; X64: %loop
 ; no complex address modes
-; X64-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X64-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 ;
 ; X32: @simple
 ; no expensive address computation in the preheader
 ; X32-NOT: imul
 ; X32: %loop
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 define i32 @simple(i32* %a, i32* %b, i32 %x) nounwind {
 entry:
   br label %loop
@@ -103,7 +103,7 @@
 ; X32-NOT: mov{{.*}}(%esp){{$}}
 ; X32: %for.body{{$}}
 ; no complex address modes
-; X32-NOT: (%{{[^)]+}},%{{[^)]+}},
+; X32-NOT: {{[1-9]+}}(%{{[^)]+}},%{{[^)]+}},
 ; no reloads
 ; X32-NOT: (%esp)
 define void @extrastride(i8* nocapture %main, i32 %main_stride, i32* nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
Index: utils/TableGen/DAGISelMatcherGen.cpp
===================================================================
--- utils/TableGen/DAGISelMatcherGen.cpp
+++ utils/TableGen/DAGISelMatcherGen.cpp
@@ -305,7 +305,7 @@
   const SDNodeInfo &CInfo = CGP.getSDNodeInfo(N->getOperator());
 
   // If this is an 'and R, 1234' where the operation is AND/OR and the RHS is
-  // a constant without a predicate fn that has more that one bit set, handle
+  // a constant without a predicate fn that has more than one bit set, handle
   // this as a special case. This is usually for targets that have special
   // handling of certain large constants (e.g. alpha with it's 8/16/32-bit
   // handling stuff). Using these instructions is often far more efficient
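
For context on the SelectionDAGBuilder change: when a libcall is lowered after type
legalization, the demoted sret result must be split right here into parts of the
legal register type. Below is a minimal standalone sketch of that arithmetic; it is
plain C++, not the LLVM API, and the 128/64-bit widths are assumed values matching
the wasm32 __multi3 case from the new test.

#include <cassert>
#include <vector>

int main() {
  // Assumed: an illegal 128-bit return value and a legal 64-bit
  // register type, as for __multi3's i128 result on wasm32.
  unsigned OrigRetBits = 128;
  unsigned PartBits = 64;

  // Mirrors the NumValues/ByteOffset computation in the new path.
  unsigned NumValues = (OrigRetBits + PartBits - 1) / PartBits;
  unsigned ByteOffset = (PartBits + 7) / 8;

  // One load of the part type per legal part, at increasing offsets
  // into the demoted sret stack slot.
  std::vector<unsigned> Offsets(NumValues);
  for (unsigned I = 0; I != NumValues; ++I)
    Offsets[I] = I * ByteOffset;

  assert(NumValues == 2 && Offsets[0] == 0 && Offsets[1] == 8);
  return 0;
}

LowerCallTo then hands the parts back as a BUILD_PAIR/MERGE_VALUES, which is the
shape the new assert in the UMULO expansion checks for.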
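
A similar sketch for the new AM.Scale == 1 fold in X86ISelDAGToDAG (again plain C++
with made-up values; Base, Index and Scale stand in for the X86ISelAddressMode
fields): when the term being matched is already the base of a unit-scale address,
swapping it into the index slot and bumping the scale absorbs the second occurrence
instead of failing the match.

#include <cassert>

int main() {
  // AM currently represents Base + Index*Scale, with Base == N, Scale == 1.
  long N = 0x2000, OldIndex = 0x30;
  long Base = N, Index = OldIndex, Scale = 1;

  // Fold another addend N: swap base and index, bump the scale.
  long Tmp = Base;
  Base = Index;
  Index = Tmp;
  Scale++;

  // The rewritten AM absorbs the extra N exactly.
  assert(Base + Index * Scale == (N + OldIndex) + N);
  return 0;
}

This is consistent with the mul-by-29 test updates, where the two trailing
addl/addq instructions collapse into a single scale-2 LEA.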
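
Finally, the identity cseLEAs relies on, as a self-contained sketch with arbitrary
operand values: for two LEAs agreeing on base, index, displacement and segment, the
larger-scale LEA equals the dominating smaller-scale LEA plus the index times the
scale difference, so it can be rebuilt with the dominating LEA's result as its base
and a zero displacement.

#include <cassert>

int main() {
  // Shared operands of the two LEAs (values are arbitrary).
  long Base = 0x1000, Index = 8, Disp = 16;
  long Scale2 = 2, Scale1 = 4; // LI2 dominates LI1; Scale1 > Scale2.

  long LI2 = Base + Scale2 * Index + Disp; // kept as-is
  long LI1 = Base + Scale1 * Index + Disp; // rewritten by cseLEAs

  // NewMI: Base = LI2's result, Scale = Scale1 - Scale2, Disp = 0.
  long NewLI1 = LI2 + (Scale1 - Scale2) * Index;
  assert(NewLI1 == LI1);
  return 0;
}

This is the rewrite path the new lea-opt-cse.ll test is meant to pin down.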