Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/TargetLowering.h
+++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h
@@ -1904,7 +1904,8 @@
   /// This may be true if the target does not directly support the
   /// multiplication operation for the specified type or the sequence of simpler
   /// ops is faster than the multiply.
-  virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
+  virtual bool decomposeMulByConstant(LLVMContext &Context,
+                                      EVT VT, SDValue C) const {
     return false;
   }

Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3556,7 +3556,7 @@
   // x * 15 --> (x << 4) - x
   // x * -33 --> -((x << 5) + x)
   // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
-  if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
+  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
     // TODO: We could handle more general decomposition of any constant by
     //       having the target set a limit on number of ops and making a
     //       callback to determine that sequence (similar to sqrt expansion).
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -1105,7 +1105,8 @@

     bool convertSelectOfConstantsToMath(EVT VT) const override;

-    bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                SDValue C) const override;

     bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
                                   bool IsSigned) const override;
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -4869,15 +4869,25 @@
   return true;
 }

-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                               SDValue C) const {
   // TODO: We handle scalars using custom code, but generic combining could make
   // that unnecessary.
   APInt MulC;
   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
     return false;

+  // Find the type this will be legalized to. Otherwise we might prematurely
+  // convert this to shl+add/sub and then still have to type legalize those ops.
+  // Another choice would be to defer the decision for illegal types until
+  // after type legalization. But constant splat vectors of i64 can't make it
+  // through type legalization on 32-bit targets so we would need to special
+  // case vXi64.
+  while (getTypeAction(Context, VT) != TypeLegal)
+    VT = getTypeToTransformTo(Context, VT);
+
   // If vector multiply is legal, assume that's faster than shl + add/sub.
-  // TODO: Multiply is a complex op with higher latency and lower througput in
+  // TODO: Multiply is a complex op with higher latency and lower throughput in
   // most implementations, so this check could be loosened based on type
   // and/or a CPU attribute.
   if (isOperationLegal(ISD::MUL, VT))
Index: llvm/trunk/test/CodeGen/X86/vector-mul.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-mul.ll
+++ llvm/trunk/test/CodeGen/X86/vector-mul.ll
@@ -435,26 +435,16 @@
 define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    pslld $4, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $4, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    pslld $4, %xmm2
-; X64-NEXT:    paddd %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $4, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_17:
@@ -484,26 +474,16 @@
 define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psllw $4, %xmm2
-; X86-NEXT:    paddw %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $4, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    psllw $4, %xmm2
-; X64-NEXT:    paddw %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $4, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_17:
@@ -797,32 +777,16 @@
 define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_neg33:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubd %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    psubd %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_neg33:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubd %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    psubd %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_neg33:
@@ -855,32 +819,16 @@
 define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_neg9:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubw %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    psubw %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_neg9:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubw %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    psubw %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_neg9:
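
Note: out-of-tree targets that override this hook need the same mechanical signature update. Below is a minimal sketch of such an override; MyTargetLowering and its choice to reuse the X86-style legalization walk are assumptions for illustration, not part of this patch. All calls used (ISD::isConstantSplatVector, getTypeAction, getTypeToTransformTo, isOperationLegal) appear in the X86 implementation above.

// Hypothetical out-of-tree override, not part of this commit. It threads the
// new LLVMContext parameter into the type-legalization queries, mirroring the
// X86 change above.
bool MyTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                              SDValue C) const {
  // Only consider constant splat vectors, as the X86 implementation does.
  APInt MulC;
  if (!ISD::isConstantSplatVector(C.getNode(), MulC))
    return false;

  // Decide based on the type the legalizer will eventually produce, so we do
  // not emit shl+add/sub that would itself still need to be type legalized.
  while (getTypeAction(Context, VT) != TypeLegal)
    VT = getTypeToTransformTo(Context, VT);

  // If a plain multiply is legal on that type, keep it; otherwise let the
  // DAGCombiner try the shl+add/sub decomposition for suitable constants.
  return !isOperationLegal(ISD::MUL, VT);
}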