Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23971,11 +23971,29 @@
   }
 }
 
+static unsigned getSmallDivisor(uint64_t Val) {
+  // Note that the order is important - 9 must be checked before 3,
+  // since every multiple of 9 is also a multiple of 3.
+  if (Val % 9 == 0)
+    return 9;
+  if (Val % 5 == 0)
+    return 5;
+  if (Val % 3 == 0)
+    return 3;
+  return 0;
+}
+
 /// PerformMulCombine - Optimize a single multiply with constant into two
 /// in order to implement it with two cheaper instructions, e.g.
 /// LEA + SHL, LEA + LEA.
 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
+  // The sequences we want to produce are larger than a single imul, so
+  // disable this combine for -Oz (minsize).
+  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::MinSize))
+    return SDValue();
+
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
@@ -23986,51 +24004,62 @@
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!C)
     return SDValue();
-  uint64_t MulAmt = C->getZExtValue();
-  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
+
+  bool IsNeg = C->getSExtValue() < 0;
+  uint64_t MulAmt = IsNeg ? -C->getSExtValue() : C->getSExtValue();
+
+  // These cases are handled by the target-independent DAGCombiner.
+  if (!MulAmt || isPowerOf2_64(MulAmt))
     return SDValue();
 
-  uint64_t MulAmt1 = 0;
-  uint64_t MulAmt2 = 0;
-  if ((MulAmt % 9) == 0) {
-    MulAmt1 = 9;
-    MulAmt2 = MulAmt / 9;
-  } else if ((MulAmt % 5) == 0) {
-    MulAmt1 = 5;
-    MulAmt2 = MulAmt / 5;
-  } else if ((MulAmt % 3) == 0) {
-    MulAmt1 = 3;
-    MulAmt2 = MulAmt / 3;
-  }
-  if (MulAmt2 &&
-      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
-    SDLoc DL(N);
+  unsigned ShiftAmt = countTrailingZeros(MulAmt);
+  if (ShiftAmt)
+    MulAmt >>= ShiftAmt;
+
+  // Count how many steps we will have to perform to replace the MUL. We
+  // limit this to 3 steps, based on an imul latency of 4 (if the latency
+  // is equal, this is probably not a win).
+  unsigned Steps = 0;
+  const unsigned MAX_STEPS = 3;
+  // Having to negate or shift is a step.
+  Steps += (IsNeg ? 1 : 0);
+  Steps += (ShiftAmt ? 1 : 0);
+
+  // Expand multiplications by integers whose only factors are in
+  // {3, 5, 9, 2^C}.
+  SDValue NewMul = N->getOperand(0);
+  SDLoc DL(N);
 
-    if (isPowerOf2_64(MulAmt2) &&
-        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
-      // If second multiplifer is pow2, issue it first. We want the multiply by
-      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
-      // is an add.
-      std::swap(MulAmt1, MulAmt2);
-
-    SDValue NewMul;
-    if (isPowerOf2_64(MulAmt1))
-      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
-                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
-    else
-      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
-                           DAG.getConstant(MulAmt1, VT));
+  // First we shift, then we multiply by the small constant, with the
+  // expectation that the small-constant multiply will become a LEA. But
+  // if the original mul has a single use, and that use is an ADD, we may
+  // get better code from the opposite order, since that ADD may be part
+  // of an addressing-mode calculation.
+  bool MulFirst = N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD;
+
+  if (ShiftAmt && !MulFirst)
+    NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                         DAG.getConstant(ShiftAmt, MVT::i8));
+
+  unsigned Val = getSmallDivisor(MulAmt);
+  for (; Val && (Steps < MAX_STEPS) && (MulAmt != 1); ++Steps) {
+    NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
+                         DAG.getConstant(Val, VT));
+    MulAmt /= Val;
+    Val = getSmallDivisor(MulAmt);
+  }
+
+  if (ShiftAmt && MulFirst)
+    NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
+                         DAG.getConstant(ShiftAmt, MVT::i8));
 
-    if (isPowerOf2_64(MulAmt2))
-      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
-                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
-    else
-      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
-                           DAG.getConstant(MulAmt2, VT));
+  if (IsNeg)
+    NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), NewMul);
 
-    // Do not add new nodes to DAG combiner worklist.
+  // Do not add new nodes to the DAG combiner worklist.
+  if (MulAmt == 1)
     DCI.CombineTo(N, NewMul, false);
-  }
 
   return SDValue();
 }
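A standalone sketch of the step accounting implemented above may help when reviewing. It mirrors getSmallDivisor and the budget loop; the names smallDivisor and decompose, and the printing, are illustrative only and not part of the patch. It prints the shift-first order, and does not model the MulFirst reordering or the actual DAG node creation.

// Standalone model of the combine above. Assumptions: smallDivisor and
// decompose are illustrative names; Budget = 3 matches MAX_STEPS.
#include <cstdint>
#include <cstdio>

static unsigned smallDivisor(uint64_t V) {
  // 9 is checked before 3: every multiple of 9 is also a multiple of 3.
  if (V % 9 == 0) return 9;
  if (V % 5 == 0) return 5;
  if (V % 3 == 0) return 3;
  return 0;
}

// Prints the operation sequence for mul-by-Imm and returns true if it
// fits the step budget (i.e. the combine would fire).
static bool decompose(int64_t Imm, unsigned Budget = 3) {
  bool Neg = Imm < 0;
  uint64_t V = Neg ? 0 - (uint64_t)Imm : (uint64_t)Imm;
  unsigned Shift = 0;
  while (V != 0 && (V & 1) == 0) { // countTrailingZeros
    V >>= 1;
    ++Shift;
  }
  // Having to negate or shift is a step, as in the patch.
  unsigned Steps = (Neg ? 1 : 0) + (Shift ? 1 : 0);
  printf("%7lld: ", (long long)Imm);
  if (Shift)
    printf("shl %u; ", Shift);
  unsigned D = smallDivisor(V);
  for (; D && Steps < Budget && V != 1; ++Steps) {
    printf("lea x%u; ", D);
    V /= D;
    D = smallDivisor(V);
  }
  if (Neg)
    printf("neg; ");
  printf("%s\n", V == 1 ? "=> combined" : "=> kept as imul");
  return V == 1;
}

For example, 1920 = 2^7 * 3 * 5 comes out as "shl 7; lea x5; lea x3": one shift plus two LEA-able multiplies, exactly at the three-step budget.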
Index: test/CodeGen/X86/imul-lea-2.ll
===================================================================
--- test/CodeGen/X86/imul-lea-2.ll
+++ test/CodeGen/X86/imul-lea-2.ll
@@ -1,19 +0,0 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-; CHECK-NOT: imul
-
-define i64 @t1(i64 %a) nounwind readnone {
-entry:
-  %0 = mul i64 %a, 81
-; CHECK: lea
-; CHECK: lea
-  ret i64 %0
-}
-
-define i64 @t2(i64 %a) nounwind readnone {
-entry:
-  %0 = mul i64 %a, 40
-; CHECK: shl
-; CHECK: lea
-  ret i64 %0
-}
Index: test/CodeGen/X86/imul.ll
===================================================================
--- test/CodeGen/X86/imul.ll
+++ test/CodeGen/X86/imul.ll
@@ -69,8 +69,7 @@
 ; X64-LABEL: mul3_32:
 ; X64: leal
 ; X86-LABEL: mul3_32:
-; But why?!
-; X86: imull
+; X86: leal
   %mul = mul i32 %A, 3
   ret i32 %mul
 }
@@ -79,8 +78,10 @@
 ; X64-LABEL: mul3_64:
 ; X64: leaq
 ; X86-LABEL: mul3_64:
-; X86: mull
-; X86-NEXT: imull
+; X86: leal
+; X86-NEXT: movl
+; X86-NEXT: mull
+; X86-NEXT: addl
   %mul = mul i64 %A, 3
   ret i64 %mul
 }
@@ -108,3 +109,54 @@
   %mul = mul i64 %A, 40
   ret i64 %mul
 }
+
+define i32 @mul81_32(i32 %A) {
+; X64-LABEL: mul81_32:
+; X64: leal
+; X64-NEXT: leal
+; X86-LABEL: mul81_32:
+; X86: leal
+; X86-NEXT: leal
+  %mul = mul i32 %A, 81
+  ret i32 %mul
+}
+
+define i32 @mulmin81_32(i32 %A) {
+; X64-LABEL: mulmin81_32:
+; X64: leal
+; X64-NEXT: leal
+; X64-NEXT: negl
+; X86-LABEL: mulmin81_32:
+; X86: leal
+; X86-NEXT: leal
+; X86-NEXT: negl
+  %mul = mul i32 %A, -81
+  ret i32 %mul
+}
+
+define i32 @mul1920_32(i32 %A) {
+; X64-LABEL: mul1920_32:
+; X64: shll
+; X64-NEXT: leal
+; X64-NEXT: leal
+; X86-LABEL: mul1920_32:
+; X86: shll
+; X86-NEXT: leal
+; X86-NEXT: leal
+  %mul = mul i32 %A, 1920
+  ret i32 %mul
+}
+
+; These muls require too many instructions (4 steps each), so they stay imul.
+define i32 @negative(i32 %A) {
+; X64-LABEL: negative:
+; X64: imull $-1920
+; X64: imull $625
+; X86-LABEL: negative:
+; X86: imull $-1920
+; X86: imull $625
+  %mul = mul i32 %A, -1920
+  %mul2 = mul i32 %A, 625
+  %f = add i32 %mul, %mul2
+  ret i32 %f
+}
\ No newline at end of file
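As a sanity check against the tests above, a small driver over the same constants (hypothetical, building on the decompose sketch earlier) reproduces the expected split between combined and imul cases:

int main() {
  decompose(81);    // lea x9; lea x9;        => combined (2 steps)
  decompose(-81);   // lea x9; lea x9; neg;   => combined (3 steps)
  decompose(1920);  // shl 7; lea x5; lea x3; => combined (3 steps)
  decompose(-1920); // neg + shl + two leas = 4 steps => kept as imul
  decompose(625);   // 5^4 needs four leas            => kept as imul
  return 0;
}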