Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -22326,7 +22326,7 @@ assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && "Should not custom lower when pmulld is available!"); - // If the upper 17 bits of each element are zero then we can use PMADD. + // If the upper 17 bits of each element are zero then we can use PMADDWD. APInt Mask17 = APInt::getHighBitsSet(32, 17); if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17)) return DAG.getNode(X86ISD::VPMADDWD, dl, VT, @@ -32707,13 +32707,6 @@ if ((NumElts % 2) != 0) return SDValue(); - // If the upper 17 bits of each element are zero then we can use PMADD. - APInt Mask17 = APInt::getHighBitsSet(32, 17); - if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) && - DAG.MaskedValueIsZero(N1, Mask17)) - return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0), - DAG.getBitcast(MVT::v8i16, N1)); - unsigned RegSize = 128; MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); @@ -32885,6 +32878,25 @@ TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { EVT VT = N->getValueType(0); + + // If the upper 17 bits of each element are zero then we can use PMADDWD, + // which is always at least as quick as PMULLD, except on KNL. + if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL && + ((VT == MVT::v4i32 && Subtarget.hasSSE2()) || + (VT == MVT::v8i32 && Subtarget.hasAVX2()) || + (VT == MVT::v16i32 && Subtarget.hasBWI()))) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + APInt Mask17 = APInt::getHighBitsSet(32, 17); + if (DAG.MaskedValueIsZero(N0, Mask17) && + DAG.MaskedValueIsZero(N1, Mask17)) { + unsigned NumElts = VT.getVectorNumElements(); + MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts); + return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT, + DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1)); + } + } + + if (DCI.isBeforeLegalize() && VT.isVector()) return reduceVMULWidth(N, DAG, Subtarget); Index: llvm/trunk/test/CodeGen/X86/promote.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/promote.ll +++ llvm/trunk/test/CodeGen/X86/promote.ll @@ -7,7 +7,7 @@ ; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-NEXT: pmulld %xmm0, %xmm0 +; X86-NEXT: pmaddwd %xmm0, %xmm0 ; X86-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; X86-NEXT: movd %xmm0, (%eax) ; X86-NEXT: xorl %eax, %eax @@ -16,7 +16,7 @@ ; X64-LABEL: mul_f: ; X64: # %bb.0: # %entry ; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-NEXT: pmulld %xmm0, %xmm0 +; X64-NEXT: pmaddwd %xmm0, %xmm0 ; X64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; X64-NEXT: movd %xmm0, (%rax) ; X64-NEXT: xorl %eax, %eax Index: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll +++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll @@ -48,7 +48,7 @@ ; X86-AVX-NEXT: movl c, %esi ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi @@ -74,7 +74,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq @@ -134,7 +134,7 @@ ; X86-AVX-NEXT: movl c, %esi ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: retl @@ -158,7 +158,7 @@ ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 ; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) ; X64-AVX-NEXT: retq entry: @@ -220,9 +220,9 @@ ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) ; X86-AVX1-NEXT: popl %esi @@ -240,7 +240,7 @@ ; X86-AVX2-NEXT: movl c, %esi ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) ; X86-AVX2-NEXT: popl %esi ; X86-AVX2-NEXT: vzeroupper @@ -268,9 +268,9 @@ ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 +; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) ; X64-AVX1-NEXT: vzeroupper @@ -281,7 +281,7 @@ ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -356,13 +356,13 @@ ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) @@ -383,9 +383,9 @@ ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; 
X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) ; X86-AVX2-NEXT: popl %esi @@ -426,13 +426,13 @@ ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) @@ -446,9 +446,9 @@ ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) ; X64-AVX2-NEXT: vzeroupper @@ -1488,7 +1488,7 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl @@ -1512,7 +1512,7 @@ ; X64-AVX-NEXT: movl $255, %ecx ; X64-AVX-NEXT: vmovq %rcx, %xmm1 ; 
X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq @@ -1624,7 +1624,7 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl c, %edx ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) ; X86-AVX-NEXT: retl @@ -1651,7 +1651,7 @@ ; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 ; X64-AVX-NEXT: vmovq %rcx, %xmm1 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) ; X64-AVX-NEXT: retq @@ -2299,8 +2299,8 @@ ; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 ; X86-AVX1-NEXT: vmovd %eax, %xmm3 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] -; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 ; X86-AVX1-NEXT: vmovd %xmm1, (%eax) @@ -2368,7 +2368,7 @@ ; X86-AVX2-NEXT: divl (%eax) ; X86-AVX2-NEXT: vmovd %edx, %xmm1 ; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 ; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 ; X86-AVX2-NEXT: vmovd %eax, %xmm2 ; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 @@ -2477,12 +2477,12 @@ ; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %esi, %xmm2 ; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X64-AVX1-NEXT: vmovd %r8d, %xmm1 ; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 @@ -2547,7 +2547,7 @@ ; X64-AVX2-NEXT: divl (%rax) ; X64-AVX2-NEXT: vmovd %edx, %xmm1 ; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 ; X64-AVX2-NEXT: vmovd %eax, %xmm2 ; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 Index: llvm/trunk/test/CodeGen/X86/slow-pmulld.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/slow-pmulld.ll +++ llvm/trunk/test/CodeGen/X86/slow-pmulld.ll @@ -33,28 +33,64 @@ ; SSE4-32-LABEL: test_mul_v4i32_v4i8: ; SSE4-32: # %bb.0: ; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 -; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 +; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 
; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v4i32_v4i8: ; SSE4-64: # %bb.0: ; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; SSE4-64-NEXT: retq ; -; AVX-32-LABEL: test_mul_v4i32_v4i8: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-32-NEXT: retl +; AVX2-32-LABEL: test_mul_v4i32_v4i8: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: retl ; -; AVX-64-LABEL: test_mul_v4i32_v4i8: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: retq +; AVX2-64-LABEL: test_mul_v4i32_v4i8: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: retq +; +; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_mul_v4i32_v4i8: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_mul_v4i32_v4i8: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: retq +; +; KNL-32-LABEL: test_mul_v4i32_v4i8: +; KNL-32: # %bb.0: +; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; KNL-32-NEXT: retl +; +; KNL-64-LABEL: test_mul_v4i32_v4i8: +; KNL-64: # %bb.0: +; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; KNL-64-NEXT: retq %z = zext <4 x i8> %A to <4 x i32> %m = mul nuw nsw <4 x i32> %z, ret <4 x i32> %m @@ -120,8 +156,8 @@ ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SSE4-32-NEXT: pmulld %xmm2, %xmm0 -; SSE4-32-NEXT: pmulld %xmm2, %xmm1 +; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 +; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v8i32_v8i8: @@ -131,25 +167,67 @@ ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SSE4-64-NEXT: pmulld %xmm2, %xmm0 -; SSE4-64-NEXT: pmulld %xmm2, %xmm1 +; SSE4-64-NEXT: pmaddwd %xmm2, %xmm0 +; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-64-NEXT: retq ; -; AVX-32-LABEL: test_mul_v8i32_v8i8: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX-32-NEXT: 
vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-32-NEXT: retl +; AVX2-32-LABEL: test_mul_v8i32_v8i8: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX2-32-NEXT: retl ; -; AVX-64-LABEL: test_mul_v8i32_v8i8: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX2-64-LABEL: test_mul_v8i32_v8i8: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_mul_v8i32_v8i8: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_mul_v8i32_v8i8: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-64-NEXT: retq +; +; KNL-32-LABEL: test_mul_v8i32_v8i8: +; KNL-32: # %bb.0: +; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; KNL-32-NEXT: retl +; +; KNL-64-LABEL: test_mul_v8i32_v8i8: +; KNL-64: # %bb.0: +; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[18778,18778,18778,18778,18778,18778,18778,18778] +; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; KNL-64-NEXT: retq %z = zext <8 x i8> %A to <8 x i32> %m = mul nuw nsw <8 x i32> %z, ret <8 x i32> %m @@ -248,10 +326,10 @@ ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] -; SSE4-32-NEXT: pmulld %xmm4, %xmm0 -; SSE4-32-NEXT: pmulld %xmm4, %xmm1 -; SSE4-32-NEXT: pmulld %xmm4, %xmm2 -; SSE4-32-NEXT: pmulld %xmm4, %xmm3 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v16i32_v16i8: @@ -264,10 +342,10 @@ ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] -; SSE4-64-NEXT: pmulld %xmm4, %xmm0 -; SSE4-64-NEXT: pmulld %xmm4, %xmm1 -; SSE4-64-NEXT: pmulld %xmm4, %xmm2 -; SSE4-64-NEXT: pmulld %xmm4, %xmm3 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3 ; SSE4-64-NEXT: retq ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8: @@ -276,8 +354,8 @@ ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-32-NEXT: retl ; ; AVX2-64-LABEL: test_mul_v16i32_v16i8: @@ -286,21 +364,45 @@ ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; -; AVX512-32-LABEL: test_mul_v16i32_v16i8: -; AVX512-32: # %bb.0: -; AVX512-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 -; AVX512-32-NEXT: retl -; -; AVX512-64-LABEL: test_mul_v16i32_v16i8: -; AVX512-64: # %bb.0: -; AVX512-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512-64-NEXT: retq +; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512DQ-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_mul_v16i32_v16i8: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0 +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_mul_v16i32_v16i8: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq +; +; KNL-32-LABEL: test_mul_v16i32_v16i8: +; KNL-32: # %bb.0: +; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 +; KNL-32-NEXT: retl +; +; KNL-64-LABEL: test_mul_v16i32_v16i8: +; KNL-64: # %bb.0: +; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; KNL-64-NEXT: retq %z = zext <16 x i8> %A to <16 x i32> %m = mul nuw nsw <16 x i32> %z, ret <16 x i32> %m @@ -621,40 +723,76 @@ ; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize: ; CHECK32: # %bb.0: ; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0 -; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 +; CHECK32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; CHECK32-NEXT: retl ; ; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize: ; CHECK64: # %bb.0: ; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0 -; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; CHECK64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize: ; SSE4-32: # %bb.0: ; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0 -; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0 +; SSE4-32-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize: ; SSE4-64: # %bb.0: ; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE4-64-NEXT: pmaddwd {{.*}}(%rip), %xmm0 ; SSE4-64-NEXT: retq ; -; AVX-32-LABEL: test_mul_v4i32_v4i8_minsize: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-32-NEXT: retl +; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: retl ; -; AVX-64-LABEL: test_mul_v4i32_v4i8_minsize: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: retq +; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: retq +; +; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: retl +; +; 
AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: retq +; +; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize: +; KNL-32: # %bb.0: +; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; KNL-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; KNL-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; KNL-32-NEXT: retl +; +; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize: +; KNL-64: # %bb.0: +; KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; KNL-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; KNL-64-NEXT: retq %z = zext <4 x i8> %A to <4 x i32> %m = mul nuw nsw <4 x i32> %z, ret <4 x i32> %m @@ -668,8 +806,8 @@ ; SLM32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLM32-NEXT: pmulld %xmm2, %xmm0 -; SLM32-NEXT: pmulld %xmm2, %xmm1 +; SLM32-NEXT: pmaddwd %xmm2, %xmm0 +; SLM32-NEXT: pmaddwd %xmm2, %xmm1 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v8i32_v8i8_minsize: @@ -679,8 +817,8 @@ ; SLM64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLM64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SLM64-NEXT: pmulld %xmm2, %xmm0 -; SLM64-NEXT: pmulld %xmm2, %xmm1 +; SLM64-NEXT: pmaddwd %xmm2, %xmm0 +; SLM64-NEXT: pmaddwd %xmm2, %xmm1 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize: @@ -690,8 +828,8 @@ ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLOW32-NEXT: pmulld %xmm2, %xmm0 -; SLOW32-NEXT: pmulld %xmm2, %xmm1 +; SLOW32-NEXT: pmaddwd %xmm2, %xmm0 +; SLOW32-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize: @@ -701,8 +839,8 @@ ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SLOW64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SLOW64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SLOW64-NEXT: pmulld %xmm2, %xmm0 -; SLOW64-NEXT: pmulld %xmm2, %xmm1 +; SLOW64-NEXT: pmaddwd %xmm2, %xmm0 +; SLOW64-NEXT: pmaddwd %xmm2, %xmm1 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize: @@ -712,8 +850,8 @@ ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SSE4-32-NEXT: pmulld %xmm2, %xmm0 -; SSE4-32-NEXT: pmulld %xmm2, %xmm1 +; SSE4-32-NEXT: pmaddwd %xmm2, %xmm0 +; SSE4-32-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize: @@ -723,25 +861,67 @@ ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE4-64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778] -; SSE4-64-NEXT: pmulld %xmm2, %xmm0 -; SSE4-64-NEXT: pmulld %xmm2, %xmm1 +; SSE4-64-NEXT: pmaddwd %xmm2, 
%xmm0 +; SSE4-64-NEXT: pmaddwd %xmm2, %xmm1 ; SSE4-64-NEXT: retq ; -; AVX-32-LABEL: test_mul_v8i32_v8i8_minsize: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 -; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-32-NEXT: retl +; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX2-32-NEXT: retl ; -; AVX-64-LABEL: test_mul_v8i32_v8i8_minsize: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512DQ-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512DQ-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512DQ-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; AVX512BW-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0 +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-64-NEXT: retq +; +; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize: +; KNL-32: # %bb.0: +; KNL-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 +; KNL-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; KNL-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; KNL-32-NEXT: retl +; +; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize: +; KNL-64: # %bb.0: +; 
KNL-64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; KNL-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; KNL-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; KNL-64-NEXT: retq %z = zext <8 x i8> %A to <8 x i32> %m = mul nuw nsw <8 x i32> %z, ret <8 x i32> %m @@ -758,10 +938,10 @@ ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SLM32-NEXT: pmulld %xmm5, %xmm0 -; SLM32-NEXT: pmulld %xmm5, %xmm1 -; SLM32-NEXT: pmulld %xmm5, %xmm2 -; SLM32-NEXT: pmulld %xmm5, %xmm3 +; SLM32-NEXT: pmaddwd %xmm5, %xmm0 +; SLM32-NEXT: pmaddwd %xmm5, %xmm1 +; SLM32-NEXT: pmaddwd %xmm5, %xmm2 +; SLM32-NEXT: pmaddwd %xmm5, %xmm3 ; SLM32-NEXT: retl ; ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize: @@ -774,10 +954,10 @@ ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLM64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SLM64-NEXT: pmulld %xmm5, %xmm0 -; SLM64-NEXT: pmulld %xmm5, %xmm1 -; SLM64-NEXT: pmulld %xmm5, %xmm2 -; SLM64-NEXT: pmulld %xmm5, %xmm3 +; SLM64-NEXT: pmaddwd %xmm5, %xmm0 +; SLM64-NEXT: pmaddwd %xmm5, %xmm1 +; SLM64-NEXT: pmaddwd %xmm5, %xmm2 +; SLM64-NEXT: pmaddwd %xmm5, %xmm3 ; SLM64-NEXT: retq ; ; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize: @@ -790,10 +970,10 @@ ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLOW32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] -; SLOW32-NEXT: pmulld %xmm4, %xmm0 -; SLOW32-NEXT: pmulld %xmm4, %xmm1 -; SLOW32-NEXT: pmulld %xmm4, %xmm2 -; SLOW32-NEXT: pmulld %xmm4, %xmm3 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm0 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm1 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm2 +; SLOW32-NEXT: pmaddwd %xmm4, %xmm3 ; SLOW32-NEXT: retl ; ; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize: @@ -806,10 +986,10 @@ ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SLOW64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SLOW64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] -; SLOW64-NEXT: pmulld %xmm4, %xmm0 -; SLOW64-NEXT: pmulld %xmm4, %xmm1 -; SLOW64-NEXT: pmulld %xmm4, %xmm2 -; SLOW64-NEXT: pmulld %xmm4, %xmm3 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm0 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm1 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm2 +; SLOW64-NEXT: pmaddwd %xmm4, %xmm3 ; SLOW64-NEXT: retq ; ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize: @@ -822,10 +1002,10 @@ ; SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; 
SSE4-32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-32-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] -; SSE4-32-NEXT: pmulld %xmm4, %xmm0 -; SSE4-32-NEXT: pmulld %xmm4, %xmm1 -; SSE4-32-NEXT: pmulld %xmm4, %xmm2 -; SSE4-32-NEXT: pmulld %xmm4, %xmm3 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-32-NEXT: pmaddwd %xmm4, %xmm3 ; SSE4-32-NEXT: retl ; ; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize: @@ -838,10 +1018,10 @@ ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE4-64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE4-64-NEXT: movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778] -; SSE4-64-NEXT: pmulld %xmm4, %xmm0 -; SSE4-64-NEXT: pmulld %xmm4, %xmm1 -; SSE4-64-NEXT: pmulld %xmm4, %xmm2 -; SSE4-64-NEXT: pmulld %xmm4, %xmm3 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm0 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm1 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm2 +; SSE4-64-NEXT: pmaddwd %xmm4, %xmm3 ; SSE4-64-NEXT: retq ; ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize: @@ -850,8 +1030,8 @@ ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-32-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-32-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX2-32-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-32-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-32-NEXT: retl ; ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize: @@ -860,21 +1040,45 @@ ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero ; AVX2-64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-64-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX2-64-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: retq ; -; AVX512-32-LABEL: test_mul_v16i32_v16i8_minsize: -; AVX512-32: # %bb.0: -; AVX512-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 -; AVX512-32-NEXT: retl -; -; AVX512-64-LABEL: test_mul_v16i32_v16i8_minsize: -; AVX512-64: # %bb.0: -; 
AVX512-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512-64-NEXT: retq +; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512DQ-64-NEXT: retq +; +; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize: +; AVX512BW-32: # %bb.0: +; AVX512BW-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-32-NEXT: vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0 +; AVX512BW-32-NEXT: retl +; +; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize: +; AVX512BW-64: # %bb.0: +; AVX512BW-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-64-NEXT: vpmaddwd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq +; +; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize: +; KNL-32: # %bb.0: +; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; KNL-32-NEXT: vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 +; KNL-32-NEXT: retl +; +; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize: +; KNL-64: # %bb.0: +; KNL-64-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; KNL-64-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; KNL-64-NEXT: retq %z = zext <16 x i8> %A to <16 x i32> %m = mul nuw nsw <16 x i32> %z, ret <16 x i32> %m
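
For illustration only (this sketch is not part of the patch above): the new combine in combineMul fires when DAG.MaskedValueIsZero proves the upper 17 bits of both multiply operands are zero, i.e. every 32-bit element is a non-negative value below 2^15, so the product fits the signed 16-bit lanes that (V)PMADDWD multiplies and the odd lanes contribute nothing to each pairwise sum. A minimal IR reproducer of that shape, mirroring the test_mul_v4i32_v4i8 case above (the function name is illustrative; the splat constant matches the 18778 broadcast seen in the checks):

; With this patch, targets matched by the new combine (SSE2+ for v4i32, AVX2 for
; v8i32, AVX512BW for v16i32, excluding KNL) select pmaddwd/vpmaddwd here
; instead of pmulld/vpmulld.
define <4 x i32> @mul_v4i32_v4i8_sketch(<4 x i8> %A) {
  ; zext from i8 clears bits 8..31 and 18778 is below 2^15, so the upper 17 bits
  ; of both operands are known zero and the PMADDWD combine applies.
  %z = zext <4 x i8> %A to <4 x i32>
  %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
  ret <4 x i32> %m
}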