diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4464,6 +4464,9 @@ SDLoc DL(N); if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // fold (mulhs x, 0) -> 0 // do not return N0/N1, because undef node may exist. if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || @@ -4521,6 +4524,9 @@ SDLoc DL(N); if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // fold (mulhu x, 0) -> 0 // do not return N0/N1, because undef node may exist. if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) || diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48562,20 +48562,50 @@ SDValue LHS = Src.getOperand(0).getOperand(0); SDValue RHS = Src.getOperand(0).getOperand(1); - unsigned ExtOpc = LHS.getOpcode(); - if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || - RHS.getOpcode() != ExtOpc) - return SDValue(); - - // Peek through the extends. - LHS = LHS.getOperand(0); - RHS = RHS.getOperand(0); - - // Ensure the input types match. - if (LHS.getValueType() != VT || RHS.getValueType() != VT) - return SDValue(); + // Count leading sign/zero bits on both inputs - if there are enough then + // truncation back to vXi16 will be cheap - either as a pack/shuffle + // sequence or using AVX512 truncations. If the inputs are sext/zext then the + // truncations may actually be free by peeking through to the ext source. + auto IsSext = [&DAG](SDValue V) { + return DAG.ComputeMinSignedBits(V) <= 16; + }; + auto IsZext = [&DAG](SDValue V) { + return DAG.computeKnownBits(V).countMaxActiveBits() <= 16; + }; - unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; + bool IsSigned = IsSext(LHS) && IsSext(RHS); + bool IsUnsigned = IsZext(LHS) && IsZext(RHS); + if (!IsSigned && !IsUnsigned) + return SDValue(); + + // Check if both inputs are extensions, which will be removed by truncation. + bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || + LHS.getOpcode() == ISD::ZERO_EXTEND) && + (RHS.getOpcode() == ISD::SIGN_EXTEND || + RHS.getOpcode() == ISD::ZERO_EXTEND) && + LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && + RHS.getOperand(0).getScalarValueSizeInBits() <= 16; + + // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on + // the (bitcasted) inputs directly, and then cheaply pack/truncate the result + // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU + // will have to split anyway. + unsigned InSizeInBits = InVT.getSizeInBits(); + if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() && + !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) && + (InSizeInBits % 16) == 0) { + EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, + InVT.getSizeInBits() / 16); + SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), + DAG.getBitcast(BCVT, RHS)); + return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); + } + + // Truncate back to source type. + LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS); + RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); + + unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; return DAG.getNode(Opc, DL, VT, LHS, RHS); } diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -26,44 +26,39 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) { ; SSE2-LABEL: and_mulhuw_v4i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: psrlq $16, %xmm0 -; SSE2-NEXT: psrlq $16, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pmulhuw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: and_mulhuw_v4i16: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4],xmm4[5],xmm0[6,7] -; SSE41-NEXT: pmuldq %xmm2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4],xmm4[5],xmm3[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4],xmm4[5],xmm1[6,7] -; SSE41-NEXT: pmuldq %xmm3, %xmm1 -; SSE41-NEXT: psrlq $16, %xmm1 -; SSE41-NEXT: psrlq $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: packusdw %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] ; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 @@ -73,10 +68,9 @@ ; AVX512-LABEL: and_mulhuw_v4i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -118,21 +112,29 @@ ; ; SSE41-LABEL: ashr_mulhw_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmaddwd %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: packusdw %xmm1, %xmm1 ; SSE41-NEXT: packusdw %xmm0, %xmm0 +; SSE41-NEXT: pmulhw %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: ashr_mulhw_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: ashr_mulhw_v4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ashr_mulhw_v4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT: retq %a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16> %b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16> %c = mul <4 x i32> %a1, %b1 @@ -175,21 +177,18 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: pmulld %xmm1, %xmm3 ; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: pmulld %xmm2, %xmm0 -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm3, %xmm0 +; SSE41-NEXT: packusdw %xmm3, %xmm2 +; SSE41-NEXT: pmulhuw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX2-LABEL: lshr_mulhuw_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -199,8 +198,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX512-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper @@ -232,42 +230,20 @@ } define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) { -; SSE2-LABEL: sextinreg_mulhw_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $24, %xmm1 -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pslld $24, %xmm0 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pslld $25, %xmm3 -; SSE2-NEXT: psrad $25, %xmm3 -; SSE2-NEXT: pslld $25, %xmm2 -; SSE2-NEXT: psrad $25, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sextinreg_mulhw_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pslld $24, %xmm1 -; SSE41-NEXT: psrad $24, %xmm1 -; SSE41-NEXT: pslld $24, %xmm0 -; SSE41-NEXT: psrad $24, %xmm0 -; SSE41-NEXT: pslld $25, %xmm3 -; SSE41-NEXT: psrad $25, %xmm3 -; SSE41-NEXT: pmulld %xmm1, %xmm3 -; SSE41-NEXT: pslld $25, %xmm2 -; SSE41-NEXT: psrad $25, %xmm2 -; SSE41-NEXT: pmulld %xmm2, %xmm0 -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm3, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: sextinreg_mulhw_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pslld $24, %xmm1 +; SSE-NEXT: psrad $24, %xmm1 +; SSE-NEXT: pslld $24, %xmm0 +; SSE-NEXT: psrad $24, %xmm0 +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: pslld $25, %xmm3 +; SSE-NEXT: psrad $25, %xmm3 +; SSE-NEXT: pslld $25, %xmm2 +; SSE-NEXT: psrad $25, %xmm2 +; SSE-NEXT: packssdw %xmm3, %xmm2 +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: retq ; ; AVX2-LABEL: sextinreg_mulhw_v8i16: ; AVX2: # %bb.0: @@ -275,10 +251,11 @@ ; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 ; AVX2-NEXT: vpslld $25, %ymm1, %ymm1 ; AVX2-NEXT: vpsrad $25, %ymm1, %ymm1 -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -288,10 +265,9 @@ ; AVX512-NEXT: vpsrad $24, %ymm0, %ymm0 ; AVX512-NEXT: vpslld $25, %ymm1, %ymm1 ; AVX512-NEXT: vpsrad $25, %ymm1, %ymm1 -; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq %a1 = shl <8 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> @@ -348,22 +324,18 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] ; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: pand %xmm6, %xmm1 ; SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pmaddwd %xmm3, %xmm7 ; SSE2-NEXT: pand %xmm6, %xmm8 -; SSE2-NEXT: pmaddwd %xmm2, %xmm8 +; SSE2-NEXT: packssdw %xmm7, %xmm8 +; SSE2-NEXT: pmulhw %xmm2, %xmm8 ; SSE2-NEXT: pand %xmm6, %xmm5 -; SSE2-NEXT: pmaddwd %xmm1, %xmm5 ; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pmaddwd %xmm6, %xmm0 -; SSE2-NEXT: psrld $16, %xmm7 -; SSE2-NEXT: psrld $16, %xmm8 -; SSE2-NEXT: packssdw %xmm7, %xmm8 -; SSE2-NEXT: psrld $16, %xmm5 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: packssdw %xmm5, %xmm0 +; SSE2-NEXT: packssdw %xmm5, %xmm6 +; SSE2-NEXT: pmulhw %xmm6, %xmm0 ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: retq ; @@ -373,48 +345,42 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767] ; SSE41-NEXT: pand %xmm6, %xmm3 ; SSE41-NEXT: pand %xmm6, %xmm2 +; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: pand %xmm6, %xmm1 ; SSE41-NEXT: pand %xmm6, %xmm0 +; SSE41-NEXT: packusdw %xmm1, %xmm0 ; SSE41-NEXT: pand %xmm6, %xmm7 -; SSE41-NEXT: pmaddwd %xmm3, %xmm7 ; SSE41-NEXT: pand %xmm6, %xmm8 -; SSE41-NEXT: pmaddwd %xmm2, %xmm8 +; SSE41-NEXT: packusdw %xmm7, %xmm8 +; SSE41-NEXT: pmulhw %xmm2, %xmm8 ; SSE41-NEXT: pand %xmm6, %xmm5 -; SSE41-NEXT: pmaddwd %xmm1, %xmm5 ; SSE41-NEXT: pand %xmm4, %xmm6 -; SSE41-NEXT: pmaddwd %xmm6, %xmm0 -; SSE41-NEXT: psrld $16, %xmm7 -; SSE41-NEXT: psrld $16, %xmm8 -; SSE41-NEXT: packusdw %xmm7, %xmm8 -; SSE41-NEXT: psrld $16, %xmm5 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm5, %xmm0 +; SSE41-NEXT: packusdw %xmm5, %xmm6 +; SSE41-NEXT: pmulhw %xmm6, %xmm0 ; SSE41-NEXT: movdqa %xmm8, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: and_mulhuw_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; ; AVX512F-LABEL: and_mulhuw_v16i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpandd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: and_mulhuw_v16i16: @@ -422,8 +388,7 @@ ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512BW-NEXT: retq %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> @@ -475,58 +440,43 @@ ; ; SSE41-LABEL: ashr_mulhuw_v16i16: ; SSE41: # %bb.0: -; SSE41-NEXT: psrld $16, %xmm4 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmaddwd %xmm4, %xmm0 -; SSE41-NEXT: psrld $16, %xmm5 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmaddwd %xmm5, %xmm1 -; SSE41-NEXT: psrld $16, %xmm6 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: pmaddwd %xmm6, %xmm2 -; SSE41-NEXT: psrld $16, %xmm7 -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: pmaddwd %xmm7, %xmm3 ; SSE41-NEXT: psrld $16, %xmm3 ; SSE41-NEXT: psrld $16, %xmm2 ; SSE41-NEXT: packusdw %xmm3, %xmm2 ; SSE41-NEXT: psrld $16, %xmm1 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrld $16, %xmm7 +; SSE41-NEXT: psrld $16, %xmm6 +; SSE41-NEXT: packusdw %xmm7, %xmm6 +; SSE41-NEXT: pmulhw %xmm2, %xmm6 +; SSE41-NEXT: psrld $16, %xmm5 +; SSE41-NEXT: psrld $16, %xmm4 +; SSE41-NEXT: packusdw %xmm5, %xmm4 +; SSE41-NEXT: pmulhw %xmm4, %xmm0 +; SSE41-NEXT: movdqa %xmm6, %xmm1 ; SSE41-NEXT: retq ; ; AVX2-LABEL: ashr_mulhuw_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $16, %ymm3, %ymm2 -; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %ymm3, %ymm1 +; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512F-LABEL: ashr_mulhuw_v16i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrad $16, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrad $16, %zmm1, %zmm1 -; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: ashr_mulhuw_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrld $16, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: ashr_mulhuw_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 +; AVX512-NEXT: vpsrld $16, %zmm1, %zmm1 +; AVX512-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> %b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> %c = mul <16 x i32> %a1, %b1