Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2195,18 +2195,23 @@ // options. This is a trivially-generalized version of the code from // Hacker's Delight (itself derived from Knuth's Algorithm M from section // 4.3.1). - SDValue Mask = - DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(), - NVT.getSizeInBits() >> 1), dl, NVT); + unsigned Bits = NVT.getSizeInBits(); + unsigned HalfBits = Bits >> 1; + SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, + NVT); SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask); SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask); SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL); SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask); - SDValue Shift = - DAG.getConstant(NVT.getSizeInBits() >> 1, dl, - TLI.getShiftAmountTy(NVT, DAG.getDataLayout())); + EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); + if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) { + // The type from TLI is too small to fit the shift amount we want. + // Override it with i32. The shift will have to be legalized. 
+ ShiftAmtTy = MVT::i32; + } + SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy); SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift); SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift); SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift); Index: test/CodeGen/X86/mul-bigint.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/mul-bigint.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @test_512p(i512* %a, i512* %b, i512* %out) #0 { + %av = load i512, i512* %a + %bv = load i512, i512* %b + %r = mul i512 %av, %bv + store i512 %r, i512* %out + ret void +} + +; CHECK-LABEL: @test_512p +; There is a lot of inter-register motion, and so matching the instruction +; sequence will be fragile. Expect exactly 18 imulq instructions. +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK: imulq +; CHECK-NOT: imulq +; CHECK: retq + +define void @test_1024p(i1024* %a, i1024* %b, i1024* %out) #0 { + %av = load i1024, i1024* %a + %bv = load i1024, i1024* %b + %r = mul i1024 %av, %bv + store i1024 %r, i1024* %out + ret void +} + +; CHECK-LABEL: @test_1024p +; CHECK: imulq +; CHECK: mulq +; CHECK: retq + +attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }