Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -32906,6 +32906,46 @@
   return SDValue();
 }
 
+// If the upper 17 bits of each element are zero then we can use PMADDWD,
+// which is always at least as quick as PMULLD, except on KNL.
+static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasSSE2())
+    return SDValue();
+
+  if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  // Only support vXi32 vectors.
+  if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
+    return SDValue();
+
+  // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
+  MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  APInt Mask17 = APInt::getHighBitsSet(32, 17);
+  if (!DAG.MaskedValueIsZero(N1, Mask17) ||
+      !DAG.MaskedValueIsZero(N0, Mask17))
+    return SDValue();
+
+  // Use SplitBinaryOpsAndApply to handle AVX splitting.
+  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                           SDValue Op1) {
+    MVT VT = MVT::getVectorVT(MVT::i32, Op0.getValueSizeInBits() / 32);
+    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Op0, Op1);
+  };
+  return SplitBinaryOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
+                                DAG.getBitcast(WVT, N0),
+                                DAG.getBitcast(WVT, N1), PMADDWDBuilder);
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -32913,23 +32953,8 @@
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
 
-  // If the upper 17 bits of each element are zero then we can use PMADDWD,
-  // which is always at least as quick as PMULLD, expect on KNL.
-  if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL &&
-      ((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
-       (VT == MVT::v8i32 && Subtarget.hasAVX2()) ||
-       (VT == MVT::v16i32 && Subtarget.useBWIRegs()))) {
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
-    APInt Mask17 = APInt::getHighBitsSet(32, 17);
-    if (DAG.MaskedValueIsZero(N0, Mask17) &&
-        DAG.MaskedValueIsZero(N1, Mask17)) {
-      unsigned NumElts = VT.getVectorNumElements();
-      MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts);
-      return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT,
-                         DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1));
-    }
-  }
+  if (SDValue V = combineMulToPMADDWD(N, DAG, DCI, Subtarget))
+    return V;
 
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
Index: test/CodeGen/X86/shrink_vmul.ll
===================================================================
--- test/CodeGen/X86/shrink_vmul.ll
+++ test/CodeGen/X86/shrink_vmul.ll
@@ -2227,89 +2227,67 @@
 ;
 ; X86-AVX1-LABEL: PR34947:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: pushl %ebp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX1-NEXT: pushl %ebx
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 12
-; X86-AVX1-NEXT: pushl %edi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 16
 ; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 20
-; X86-AVX1-NEXT: subl $16, %esp
-; X86-AVX1-NEXT: .cfi_def_cfa_offset 36
-; X86-AVX1-NEXT: .cfi_offset %esi, -20
-; X86-AVX1-NEXT: .cfi_offset %edi, -16
-; X86-AVX1-NEXT: .cfi_offset %ebx, -12
-; X86-AVX1-NEXT: .cfi_offset %ebp, -8
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
 ; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0
-; X86-AVX1-NEXT: xorl %eax, %eax
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl (%eax)
-; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
 ; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
+; X86-AVX1-NEXT: movl %edx, %ecx
+; X86-AVX1-NEXT: vmovd %xmm0, %esi
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
+; X86-AVX1-NEXT: divl %esi
+; X86-AVX1-NEXT: vmovd %edx, %xmm1
+; X86-AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
 ; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT: vmovd %xmm0, %ecx
+; X86-AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
 ; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, %ebp
+; X86-AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
 ; X86-AVX1-NEXT: divl %ecx
 ; X86-AVX1-NEXT: movl %edx, %ecx
+; X86-AVX1-NEXT: vmovd %xmm0, %esi
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi
 ; X86-AVX1-NEXT: divl %esi
-; X86-AVX1-NEXT: movl %edx, %esi
+; X86-AVX1-NEXT: vmovd %edx, %xmm2
+; X86-AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi
-; X86-AVX1-NEXT: divl %edi
-; X86-AVX1-NEXT: movl %edx, %edi
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
 ; X86-AVX1-NEXT: xorl %eax, %eax
 ; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: vmovd %xmm0, %ebx
-; X86-AVX1-NEXT: divl %ebx
-; X86-AVX1-NEXT: vmovd %edx, %xmm0
-; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovd %ebp, %xmm1
-; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
-; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
-; X86-AVX1-NEXT: vmovd %eax, %xmm3
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm1, %xmm1
+; X86-AVX1-NEXT: divl %ecx
+; X86-AVX1-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; X86-AVX1-NEXT: xorl %eax, %eax
+; X86-AVX1-NEXT: xorl %edx, %edx
+; X86-AVX1-NEXT: divl (%eax)
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1
+; X86-AVX1-NEXT: vmovd %edx, %xmm1
+; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
+; X86-AVX1-NEXT: vmovd %eax, %xmm2
+; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
 ; X86-AVX1-NEXT: vmovd %xmm1, (%eax)
 ; X86-AVX1-NEXT: vmovaps %ymm0, (%eax)
-; X86-AVX1-NEXT: addl $16, %esp
 ; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: popl %edi
-; X86-AVX1-NEXT: popl %ebx
-; X86-AVX1-NEXT: popl %ebp
 ; X86-AVX1-NEXT: vzeroupper
 ; X86-AVX1-NEXT: retl
 ;
@@ -2421,77 +2399,63 @@
 ;
 ; X64-AVX1-LABEL: PR34947:
 ; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbp
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX1-NEXT: pushq %rbx
-; X64-AVX1-NEXT: .cfi_def_cfa_offset 24
-; X64-AVX1-NEXT: .cfi_offset %rbx, -24
-; X64-AVX1-NEXT: .cfi_offset %rbp, -16
 ; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0
+; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl (%rax)
-; X64-AVX1-NEXT: movl %edx, %r8d
-; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: movl %edx, %ecx
+; X64-AVX1-NEXT: vmovd %xmm0, %esi
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %r9d
+; X64-AVX1-NEXT: divl %esi
+; X64-AVX1-NEXT: vmovd %edx, %xmm1
+; X64-AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
 ; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %r10d
-; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
+; X64-AVX1-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
 ; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %r11d
-; X64-AVX1-NEXT: vmovd %xmm0, %ecx
+; X64-AVX1-NEXT: vpinsrd $3, %edx, %xmm1, %xmm1
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
 ; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %esi
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
+; X64-AVX1-NEXT: movl %edx, %ecx
+; X64-AVX1-NEXT: vmovd %xmm0, %esi
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %edi
+; X64-AVX1-NEXT: divl %esi
+; X64-AVX1-NEXT: vmovd %edx, %xmm2
+; X64-AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
 ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
 ; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %ecx
-; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx
+; X64-AVX1-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2
+; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ebx
-; X64-AVX1-NEXT: movl %edx, %ebx
-; X64-AVX1-NEXT: vmovd %xmm0, %ebp
+; X64-AVX1-NEXT: divl %ecx
+; X64-AVX1-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
 ; X64-AVX1-NEXT: xorl %eax, %eax
 ; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ebp
-; X64-AVX1-NEXT: vmovd %edx, %xmm0
-; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovd %esi, %xmm2
-; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: divl (%rax)
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-AVX1-NEXT: vmovd %r8d, %xmm1
+; X64-AVX1-NEXT: vmovd %edx, %xmm1
 ; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007
 ; X64-AVX1-NEXT: vmovd %eax, %xmm2
 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
 ; X64-AVX1-NEXT: vmovd %xmm1, (%rax)
 ; X64-AVX1-NEXT: vmovaps %ymm0, (%rax)
-; X64-AVX1-NEXT: popq %rbx
-; X64-AVX1-NEXT: popq %rbp
 ; X64-AVX1-NEXT: vzeroupper
 ; X64-AVX1-NEXT: retq
 ;
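Note (not part of the patch): below is a minimal, hypothetical LLVM IR sketch of the pattern the new combine targets; the function name and the 32767 masks are illustrative only, not taken from the test file. Masking both multiplicands to 15 bits makes the upper 17 bits of every i32 element known zero, so on an SSE2-capable target other than KNL combineMulToPMADDWD should be able to lower the mul to pmaddwd instead of pmulld.

define <4 x i32> @mul_v4i32_upper17_zero(<4 x i32> %a, <4 x i32> %b) {
  ; Clearing bits 15..31 of each element lets MaskedValueIsZero prove that
  ; the upper 17 bits of both multiply operands are zero.
  %a15 = and <4 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767>
  %b15 = and <4 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767>
  %m = mul <4 x i32> %a15, %b15
  ret <4 x i32> %m
}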