Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2195,18 +2195,23 @@
     // options. This is a trivially-generalized version of the code from
     // Hacker's Delight (itself derived from Knuth's Algorithm M from section
     // 4.3.1).
-    SDValue Mask =
-      DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
-                                           NVT.getSizeInBits() >> 1), dl, NVT);
+    unsigned Bits = NVT.getSizeInBits();
+    unsigned HalfBits = Bits >> 1;
+    SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
+                                   NVT);
     SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
     SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
 
     SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
     SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
 
-    SDValue Shift =
-      DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
-                      TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
+    EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
+    if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) {
+      // The type from TLI is too small to fit the shift amount we want.
+      // Override it with i32. The shift will have to be legalized.
+      ShiftAmtTy = MVT::i32;
+    }
+    SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy);
     SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
     SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
     SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
Index: test/CodeGen/X86/mul-bigint.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/mul-bigint.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test_512p(i512* %a, i512* %b, i512* %out) #0 { ; *%out = *%a * *%b at 512 bits
+  %av = load i512, i512* %a ; left operand
+  %bv = load i512, i512* %b ; right operand
+  %r = mul i512 %av, %bv ; the wide multiply whose lowering the CHECK lines below inspect
+  store i512 %r, i512* %out
+  ret void
+}
+
+; CHECK-LABEL: @test_512p
+; There is a lot of inter-register motion, and so matching the instruction
+; sequence will be fragile. The checks below expect exactly 18 imulq.
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK: imulq
+; CHECK-NOT: imulq
+; CHECK: retq
+
+define void @test_1024p(i1024* %a, i1024* %b, i1024* %out) #0 { ; *%out = *%a * *%b at 1024 bits
+  %av = load i1024, i1024* %a ; left operand
+  %bv = load i1024, i1024* %b ; right operand
+  %r = mul i1024 %av, %bv ; the wide multiply whose lowering the CHECK lines below inspect
+  store i1024 %r, i1024* %out
+  ret void
+}
+
+; CHECK-LABEL: @test_1024p
+; CHECK: imulq
+; CHECK: mulq
+; CHECK: retq
+
+attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }