Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1904,7 +1904,8 @@
   /// This may be true if the target does not directly support the
   /// multiplication operation for the specified type or the sequence of simpler
   /// ops is faster than the multiply.
-  virtual bool decomposeMulByConstant(EVT VT, SDValue C) const {
+  virtual bool decomposeMulByConstant(LLVMContext &Context,
+                                      EVT VT, SDValue C) const {
     return false;
   }

Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3542,7 +3542,7 @@
   // x * 15 --> (x << 4) - x
   // x * -33 --> -((x << 5) + x)
   // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
-  if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
+  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
     // TODO: We could handle more general decomposition of any constant by
     //       having the target set a limit on number of ops and making a
     //       callback to determine that sequence (similar to sqrt expansion).

Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1101,7 +1101,8 @@

     bool convertSelectOfConstantsToMath(EVT VT) const override;

-    bool decomposeMulByConstant(EVT VT, SDValue C) const override;
+    bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                SDValue C) const override;

     bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
                                   bool IsSigned) const override;

Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4880,18 +4880,25 @@
   return true;
 }

-bool X86TargetLowering::decomposeMulByConstant(EVT VT, SDValue C) const {
+bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+                                               SDValue C) const {
   // TODO: We handle scalars using custom code, but generic combining could make
   //       that unnecessary.
   APInt MulC;
   if (!ISD::isConstantSplatVector(C.getNode(), MulC))
     return false;

+  // Find the type this will be legalized to. Otherwise we might prematurely
+  // convert this to shl+add/sub and then still have to type legalize those ops.
+  EVT LegalVT = VT;
+  while (getTypeAction(Context, LegalVT) != TypeLegal)
+    LegalVT = getTypeToTransformTo(Context, LegalVT);
+
   // If vector multiply is legal, assume that's faster than shl + add/sub.
-  // TODO: Multiply is a complex op with higher latency and lower througput in
+  // TODO: Multiply is a complex op with higher latency and lower throughput in
   //       most implementations, so this check could be loosened based on type
   //       and/or a CPU attribute.
-  if (isOperationLegal(ISD::MUL, VT))
+  if (isOperationLegal(ISD::MUL, LegalVT))
     return false;

   // shl+add, shl+sub, shl+add+neg

Index: llvm/test/CodeGen/X86/vector-mul.ll
===================================================================
--- llvm/test/CodeGen/X86/vector-mul.ll
+++ llvm/test/CodeGen/X86/vector-mul.ll
@@ -435,26 +435,16 @@
 define <8 x i32> @mul_v8i32_17(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    pslld $4, %xmm2
-; X86-NEXT:    paddd %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $4, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    pslld $4, %xmm2
-; X64-NEXT:    paddd %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $4, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_17:
@@ -484,26 +474,16 @@
 define <16 x i16> @mul_v16i16_17(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_17:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm2
-; X86-NEXT:    psllw $4, %xmm2
-; X86-NEXT:    paddw %xmm0, %xmm2
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $4, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    movdqa %xmm2, %xmm0
-; X86-NEXT:    movdqa %xmm3, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_17:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    psllw $4, %xmm2
-; X64-NEXT:    paddw %xmm0, %xmm2
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $4, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    movdqa %xmm2, %xmm0
-; X64-NEXT:    movdqa %xmm3, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_17:
@@ -797,32 +777,16 @@
 define <8 x i32> @mul_v8i32_neg33(<8 x i32> %a0) nounwind {
 ; X86-LABEL: mul_v8i32_neg33:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubd %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    pslld $5, %xmm3
-; X86-NEXT:    paddd %xmm1, %xmm3
-; X86-NEXT:    psubd %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X86-NEXT:    pmulld %xmm2, %xmm0
+; X86-NEXT:    pmulld %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v8i32_neg33:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubd %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    pslld $5, %xmm3
-; X64-NEXT:    paddd %xmm1, %xmm3
-; X64-NEXT:    psubd %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [4294967263,4294967263,4294967263,4294967263]
+; X64-NEXT:    pmulld %xmm2, %xmm0
+; X64-NEXT:    pmulld %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v8i32_neg33:
@@ -855,32 +819,16 @@
 define <16 x i16> @mul_v16i16_neg9(<16 x i16> %a0) nounwind {
 ; X86-LABEL: mul_v16i16_neg9:
 ; X86:       # %bb.0:
-; X86-NEXT:    movdqa %xmm0, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm0, %xmm3
-; X86-NEXT:    pxor %xmm2, %xmm2
-; X86-NEXT:    pxor %xmm0, %xmm0
-; X86-NEXT:    psubw %xmm3, %xmm0
-; X86-NEXT:    movdqa %xmm1, %xmm3
-; X86-NEXT:    psllw $3, %xmm3
-; X86-NEXT:    paddw %xmm1, %xmm3
-; X86-NEXT:    psubw %xmm3, %xmm2
-; X86-NEXT:    movdqa %xmm2, %xmm1
+; X86-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X86-NEXT:    pmullw %xmm2, %xmm0
+; X86-NEXT:    pmullw %xmm2, %xmm1
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: mul_v16i16_neg9:
 ; X64:       # %bb.0:
-; X64-NEXT:    movdqa %xmm0, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm0, %xmm3
-; X64-NEXT:    pxor %xmm2, %xmm2
-; X64-NEXT:    pxor %xmm0, %xmm0
-; X64-NEXT:    psubw %xmm3, %xmm0
-; X64-NEXT:    movdqa %xmm1, %xmm3
-; X64-NEXT:    psllw $3, %xmm3
-; X64-NEXT:    paddw %xmm1, %xmm3
-; X64-NEXT:    psubw %xmm3, %xmm2
-; X64-NEXT:    movdqa %xmm2, %xmm1
+; X64-NEXT:    movdqa {{.*#+}} xmm2 = [65527,65527,65527,65527,65527,65527,65527,65527]
+; X64-NEXT:    pmullw %xmm2, %xmm0
+; X64-NEXT:    pmullw %xmm2, %xmm1
 ; X64-NEXT:    retq
 ;
 ; X64-XOP-LABEL: mul_v16i16_neg9:
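Note (editor's illustration, not part of the patch): the DAGCombiner hunk above relies on the scalar strength-reduction identities listed in its comments (x * 17 --> (x << 4) + x, x * 15 --> (x << 4) - x, x * -33 --> -((x << 5) + x)); decomposeMulByConstant() only decides whether that rewrite is worthwhile, and with this change the X86 override asks the question about the type the vector will eventually be legalized to. The standalone C++ sketch below spells out those identities; the helper names are invented for the example and do not exist in LLVM.

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Hypothetical helpers mirroring the DAGCombiner comments above.
// x * 17  --> (x << 4) + x
static uint32_t mulBy17(uint32_t X) { return (X << 4) + X; }
// x * 15  --> (x << 4) - x
static uint32_t mulBy15(uint32_t X) { return (X << 4) - X; }
// x * -33 --> -((x << 5) + x)
static int64_t mulByNeg33(int64_t X) { return -((X << 5) + X); }

int main() {
  // Sanity-check the identities against plain multiplication.
  for (uint32_t X : {0u, 1u, 7u, 123456u}) {
    assert(mulBy17(X) == X * 17u);
    assert(mulBy15(X) == X * 15u);
    assert(mulByNeg33(X) == static_cast<int64_t>(X) * -33);
  }
  return 0;
}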