diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -27651,20 +27651,65 @@
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT SetccVT =
       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  EVT OvfVT = Op->getValueType(1);
   SDValue Ovf;
   if (IsSigned) {
     // SMULO overflows if the high bits don't match the sign of the low.
-    SDValue LowSign =
-        DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
+    SDValue LowSign;
+
+    if ((OvfVT == MVT::v16i1 && Subtarget.hasBWI()) ||
+        (OvfVT == MVT::v32i1 && Subtarget.canExtendTo512BW())) {
+      // We can do a compare directly on the vXi16 PMULLW result. But first
+      // we need to reverse the truncate that was done in LowerFullMultiplyvXi8.
+      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+      High = DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, High);
+      High = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, High, 8, DAG);
+      High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, High, 8, DAG);
+
+      LowSign = DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, Low);
+      LowSign = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, LowSign, 8, DAG);
+      LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign, 15, DAG);
+      SetccVT = OvfVT;
+    } else if (OvfVT == MVT::v16i1 && Subtarget.canExtendTo512DQ()) {
+      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+      High = DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, High);
+      High = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, High, 8, DAG);
+      High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, High, 8, DAG);
+      ExVT = MVT::v16i32;
+      High = DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, High);
+
+      LowSign = DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, Low);
+      LowSign = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, LowSign, 24, DAG);
+      LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign, 31, DAG);
+      SetccVT = OvfVT;
+    } else {
+      LowSign = DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
+    }
+
     Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
   } else {
+    // Optimize for cases where the result type is vXi1 with avx512.
+    if ((OvfVT == MVT::v16i1 && Subtarget.hasBWI()) ||
+        (OvfVT == MVT::v32i1 && Subtarget.canExtendTo512BW())) {
+      // We can do a compare directly on the vXi16 PMULLW result. But first
+      // we need to reverse the truncate that was done in LowerFullMultiplyvXi8.
+      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
+      High = DAG.getNode(ISD::ZERO_EXTEND, dl, ExVT, High);
+      SetccVT = OvfVT;
+    } else if (OvfVT == MVT::v16i1 && Subtarget.canExtendTo512DQ()) {
+      // We can't use a vXi16 compare, but we can use a v16i32 compare.
+      High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
+      SetccVT = OvfVT;
+    }
+
     // UMULO overflows if the high bits are non-zero.
-    Ovf =
-        DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
+    Ovf = DAG.getSetCC(dl, SetccVT, High,
+                       DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
   }
 
-  Ovf = DAG.getSExtOrTrunc(Ovf, dl, Op->getValueType(1));
+  // Convert to the desired type if we used a different type above.
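+  // Ovf is either a vXi1 mask (from the AVX512 mask compares above) or a
+  // vector setcc result in SetccVT; getSExtOrTrunc normalizes it to OvfVT in
+  // both cases.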
+ Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT); return DAG.getMergeValues({Low, Ovf}, dl); } @@ -30146,8 +30191,7 @@ case ISD::USUBO: return LowerXALUO(Op, DAG); case ISD::SMULO: - case ISD::UMULO: - return LowerMULO(Op, Subtarget, DAG); + case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); case ISD::SADDO_CARRY: diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1455,34 +1455,27 @@ ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpslld $24, %zmm2, %zmm0 +; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512F-NEXT: vpmovdb %zmm2, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: smulo_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1 ; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %ymm0, %xmm1 -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BW-NEXT: vpcmpneqb %xmm0, %xmm2, %k1 +; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsraw $8, %ymm1, %ymm0 +; AVX512BW-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsraw $15, %ymm2, %ymm2 +; AVX512BW-NEXT: vpcmpneqw %ymm0, %ymm2, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa %xmm1, (%rdi) +; AVX512BW-NEXT: vpmovwb %ymm1, (%rdi) ; AVX512BW-NEXT: retq %t = call {<16 x i8>, <16 x i1>} @llvm.smul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 @@ -1874,51 +1867,40 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 ; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm5 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm3 +; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm5, %xmm2 -; AVX512F-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm2 -; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 +; AVX512F-NEXT: vpslld $24, %zmm2, %zmm4 +; AVX512F-NEXT: vpsrad $31, %zmm4, %zmm4 +; AVX512F-NEXT: vpcmpneqd %zmm3, %zmm4, %k1 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm1 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 +; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpslld $24, %zmm3, %zmm0 +; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa %xmm3, 16(%rdi) -; AVX512F-NEXT: vmovdqa %xmm2, (%rdi) +; AVX512F-NEXT: vpmovdb %zmm2, 16(%rdi) +; AVX512F-NEXT: vpmovdb %zmm3, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: smulo_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1 ; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpcmpneqb %ymm0, %ymm1, %k1 +; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsraw $8, %zmm2, %zmm0 +; AVX512BW-NEXT: vpsllw $8, %zmm2, %zmm1 +; AVX512BW-NEXT: vpsraw $15, %zmm1, %zmm1 +; AVX512BW-NEXT: vpcmpneqw %zmm0, %zmm1, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; 
AVX512BW-NEXT: kshiftrd $16, %k1, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512BW-NEXT: vmovdqa %ymm2, (%rdi) +; AVX512BW-NEXT: vpmovwb %zmm2, (%rdi) ; AVX512BW-NEXT: retq %t = call {<32 x i8>, <32 x i1>} @llvm.smul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1) %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0 @@ -2664,73 +2646,56 @@ ; ; AVX512F-LABEL: smulo_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 -; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 -; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4 -; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm6 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm4 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm7 -; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero -; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 -; AVX512F-NEXT: vpcmpeqb %xmm6, %xmm7, %xmm6 -; AVX512F-NEXT: vpternlogq $15, %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpmovsxbd %xmm6, %zmm6 -; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5 ; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm5, %xmm5 -; AVX512F-NEXT: vpcmpgtb %xmm5, %xmm2, %xmm6 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm6, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-NEXT: vpmovsxbw %xmm6, %ymm6 -; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm5 +; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm6, %xmm6 -; AVX512F-NEXT: vpcmpgtb %xmm6, %xmm2, %xmm7 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm7, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k3 +; AVX512F-NEXT: vpslld $24, %zmm6, %zmm3 +; AVX512F-NEXT: vpsrad $31, %zmm3, %zmm3 +; AVX512F-NEXT: vpcmpneqd %zmm5, %zmm3, %k1 +; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 +; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm3 +; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpslld $24, %zmm4, %zmm2 +; AVX512F-NEXT: vpsrad $31, %zmm2, %zmm2 +; AVX512F-NEXT: vpcmpneqd %zmm3, %zmm2, %k2 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm3 +; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpslld $24, %zmm5, %zmm2 +; AVX512F-NEXT: vpsrad $31, %zmm2, %zmm2 +; AVX512F-NEXT: vpcmpneqd %zmm3, %zmm2, %k3 ; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 ; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm7 -; AVX512F-NEXT: vpcmpgtb %xmm7, %xmm2, %xmm1 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4 +; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpslld $24, %zmm7, %zmm0 +; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm0 +; AVX512F-NEXT: vpcmpneqd %zmm1, %zmm0, %k4 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} -; 
AVX512F-NEXT: vmovdqa %xmm4, 48(%rdi) -; AVX512F-NEXT: vmovdqa %xmm5, 32(%rdi) -; AVX512F-NEXT: vmovdqa %xmm6, 16(%rdi) -; AVX512F-NEXT: vmovdqa %xmm7, (%rdi) +; AVX512F-NEXT: vpmovdb %zmm6, 48(%rdi) +; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) +; AVX512F-NEXT: vpmovdb %zmm5, 16(%rdi) +; AVX512F-NEXT: vpmovdb %zmm7, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: smulo_v64i8: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1254,11 +1254,6 @@ ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero @@ -1271,8 +1266,7 @@ ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm0 -; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BW-NEXT: vptestmb %xmm0, %xmm0, %k1 +; AVX512BW-NEXT: vptestmw %ymm0, %ymm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovwb %ymm1, (%rdi) ; AVX512BW-NEXT: retq @@ -1633,21 +1627,12 @@ ; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 ; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -1663,8 +1648,7 @@ ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestmb %ymm0, %ymm0, %k1 +; AVX512BW-NEXT: vptestmw %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} @@ -2344,52 +2328,35 @@ ; ; AVX512F-LABEL: umulo_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4 -; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovdb %zmm2, %xmm6 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm6, %xmm6 -; AVX512F-NEXT: vpternlogq $15, %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpmovsxbd %xmm6, %zmm6 -; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = 
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm3 +; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero +; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5 +; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero -; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm6 -; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm3 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k3 ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm7 ; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} ; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}