Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16131,6 +16131,21 @@
       NewBV = DAG.getBitcast(VT, NewBV);
       return NewBV;
     }
+
+    // If we're shuffling an insert_vector_elt, see if we are broadcasting
+    // the inserted element and, if so, convert to a build_vector.
+    if (V->getOpcode() == ISD::INSERT_VECTOR_ELT &&
+        isa<ConstantSDNode>(V->getOperand(2))) {
+      unsigned Elt = cast<ConstantSDNode>(V->getOperand(2))->getZExtValue();
+      if (Elt == (unsigned)SVN->getSplatIndex()) {
+        SmallVector<SDValue, 8> Ops(NumElts, V->getOperand(1));
+        SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
+
+        // We may have jumped through bitcasts, so the type of the
+        // BUILD_VECTOR may not match the type of the shuffle.
+        return DAG.getBitcast(VT, NewBV);
+      }
+    }
   }
 
   // There are various patterns used to build up a vector from smaller vectors,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1191,7 +1191,6 @@
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
     setOperationAction(ISD::VSELECT, VT, Expand);
   }
 
@@ -1439,7 +1438,6 @@
     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
     setOperationAction(ISD::SELECT, VT, Custom);
     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
   }
 
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
@@ -14294,56 +14292,6 @@
   }
 }
 
-// Lower vXi1 vector shuffles.
-// There is no a dedicated instruction on AVX-512 that shuffles the masks.
-// The only way to shuffle bits is to sign-extend the mask vector to SIMD
-// vector, shuffle and then truncate it back.
-static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                      MVT VT, SDValue V1, SDValue V2,
-                                      const X86Subtarget &Subtarget,
-                                      SelectionDAG &DAG) {
-  assert(Subtarget.hasAVX512() &&
-         "Cannot lower 512-bit vectors w/o basic ISA!");
-  MVT ExtVT;
-  switch (VT.SimpleTy) {
-  default:
-    llvm_unreachable("Expected a vector of i1 elements");
-  case MVT::v2i1:
-    ExtVT = MVT::v2i64;
-    break;
-  case MVT::v4i1:
-    ExtVT = MVT::v4i32;
-    break;
-  case MVT::v8i1:
-    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
-    // shuffle.
-    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
-    break;
-  case MVT::v16i1:
-    ExtVT = MVT::v16i32;
-    break;
-  case MVT::v32i1:
-    ExtVT = MVT::v32i16;
-    break;
-  case MVT::v64i1:
-    ExtVT = MVT::v64i8;
-    break;
-  }
-
-  V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
-  V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
-
-  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
-  // i1 was sign extended we can use X86ISD::CVT2MASK.
-  int NumElems = VT.getVectorNumElements();
-  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
-      (Subtarget.hasDQI() && (NumElems < 32)))
-    return DAG.getNode(X86ISD::PCMPGTM, DL, VT, DAG.getConstant(0, DL, ExtVT),
-                       Shuffle);
-
-  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
-}
-
 /// Helper function that returns true if the shuffle mask should be
 /// commuted to improve canonicalization.
 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
@@ -14504,9 +14452,6 @@
     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                     DAG);
 
-  if (Is1BitVector)
-    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
-
   llvm_unreachable("Unimplemented!");
 }
 
@@ -30011,6 +29956,59 @@
   return SDValue();
 }
 
+// Promote vXi1 vector shuffles: there is no dedicated instruction on AVX-512
+// that shuffles the masks. The only way to shuffle bits is to sign-extend the
+// mask vector to a SIMD vector, shuffle, and then truncate it back.
+static SDValue promoteI1Shuffle(SDNode *N, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+  MVT VT = N->getValueType(0).getSimpleVT();
+  SDLoc dl(N);
+
+  assert(Subtarget.hasAVX512() &&
+         "Cannot promote i1 vector shuffles w/o AVX-512!");
+  MVT ExtVT;
+  switch (VT.SimpleTy) {
+  default:
+    llvm_unreachable("Expected a vector of i1 elements");
+  case MVT::v2i1:
+    ExtVT = MVT::v2i64;
+    break;
+  case MVT::v4i1:
+    ExtVT = MVT::v4i32;
+    break;
+  case MVT::v8i1:
+    // Take a 512-bit type to get more shuffle options on KNL. If we have
+    // VLX, use a 256-bit shuffle.
+    ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
+    break;
+  case MVT::v16i1:
+    ExtVT = MVT::v16i32;
+    break;
+  case MVT::v32i1:
+    ExtVT = MVT::v32i16;
+    break;
+  case MVT::v64i1:
+    ExtVT = MVT::v64i8;
+    break;
+  }
+
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+
+  SDValue V1 = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, N->getOperand(0));
+  SDValue V2 = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, N->getOperand(1));
+
+  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, dl, V1, V2, Mask);
+
+  // i1 was sign-extended, so we can move back to a mask with a signed
+  // compare against zero.
+  int NumElems = VT.getVectorNumElements();
+  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
+      (Subtarget.hasDQI() && (NumElems < 32)))
+    return DAG.getSetCC(dl, VT, Shuffle, DAG.getConstant(0, dl, ExtVT),
+                        ISD::SETLT);
+
+  return DAG.getNode(ISD::TRUNCATE, dl, VT, Shuffle);
+}
+
 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
@@ -30030,6 +30028,11 @@
       return HAddSub;
   }
 
+  if (DCI.isBeforeLegalizeOps() && Subtarget.hasAVX512() &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+      VT.getVectorElementType() == MVT::i1)
+    return promoteI1Shuffle(N, DAG, Subtarget);
+
   // During Type Legalization, when promoting illegal vector types,
   // the backend might introduce new shuffle dag nodes and bitcasts.
 //
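For illustration, the DAGCombiner change fires on patterns like the following
minimal sketch (function and value names are invented for this note, not taken
from the patch or its tests): the splat mask reads only the lane that the
insertelement just wrote, so the whole shuffle can be rebuilt as a
BUILD_VECTOR that repeats the inserted scalar.

define <4 x i32> @splat_inserted_elt(<4 x i32> %v, i32 %x) {
  ; Lane 0 is overwritten with %x...
  %ins = insertelement <4 x i32> %v, i32 %x, i32 0
  ; ...and the splat mask reads only lane 0, so every result lane is %x.
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %splat
}

Because the combine may have walked through bitcasts to find the
insert_vector_elt, the resulting BUILD_VECTOR is bitcast back to the shuffle's
type, as the code comment above notes.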
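The promoteI1Shuffle path can be pictured at the IR level with a hypothetical
v8i1 case (again, names invented for illustration): the mask is sign-extended
so each lane is all-zeros or all-ones, shuffled in the wider type (v8i64 on
plain AVX-512, v8i32 with VLX), and then narrowed back to a mask, either by
truncation or, on BWI/DQI targets, by the signed compare against zero that the
new code emits via getSetCC.

define <8 x i1> @reverse_mask(<8 x i1> %m) {
  ; Sign-extend: each i1 lane becomes 0 or -1 in the wide element type.
  %ext = sext <8 x i1> %m to <8 x i64>
  ; Shuffle in the wide type, where real shuffle instructions exist.
  %shuf = shufflevector <8 x i64> %ext, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ; Narrow back to a mask (the combine emits either a trunc or a setcc < 0).
  %res = trunc <8 x i64> %shuf to <8 x i1>
  ret <8 x i1> %res
}

Doing this as a DAG combine before operation legalization, instead of as
custom VECTOR_SHUFFLE lowering, is why the updated checks below show a
sign-extend (vpsra*) plus an ordinary vector shuffle rather than round-trips
through k-registers.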
Index: test/CodeGen/X86/vector-shuffle-v1.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-v1.ll
+++ test/CodeGen/X86/vector-shuffle-v1.ll
@@ -7,31 +7,22 @@
 ; AVX512F-LABEL: shuf2i1_1_0:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuf2i1_1_0:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpsraq $63, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX512VL-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf2i1_1_0:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
+; VL_BW_DQ-NEXT:    vpsraq $63, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
@@ -44,38 +35,29 @@
 ; AVX512F-LABEL: shuf2i1_1_2:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; AVX512F-NEXT:    movq $-1, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm1
 ; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuf2i1_1_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
 ; AVX512VL-NEXT:    movq $-1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovq %rax, %xmm1
+; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsraq $63, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf2i1_1_2:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
-; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
 ; VL_BW_DQ-NEXT:    movq $-1, %rax
-; VL_BW_DQ-NEXT:    vmovq %rax, %xmm0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
-; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT:    vmovq %rax, %xmm1
+; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
+; VL_BW_DQ-NEXT:    vpsraq $63, %xmm0, %xmm0
+; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
 ; VL_BW_DQ-NEXT:    retq
@@ -88,31 +70,21 @@
 ; AVX512F-LABEL: shuf4i1_3_2_10:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %zmm0
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuf4i1_3_2_10:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; AVX512VL-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
+; VL_BW_DQ-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
 ; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
@@ -128,10 +100,7 @@
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
 ; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -139,11 +108,9 @@
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
-; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -173,9 +140,7 @@
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpmovdb %zmm2, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -187,9 +152,7 @@
 ; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpmovdb %zmm2, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -214,27 +177,49 @@
 define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
 ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT:    vpsllw $15, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm0
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17]
+; AVX512F-NEXT:    vpsllw $15, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsraw $15, %ymm1, %ymm1
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,2,1]
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
-; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VL-NEXT:    vpsllw $15, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsraw $15, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,6,3,0,1,6,3]
+; AVX512VL-NEXT:    vpermd %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17]
+; AVX512VL-NEXT:    vpsllw $15, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsraw $15, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[1,1,2,1]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
-; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
+; VL_BW_DQ-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; VL_BW_DQ-NEXT:    vpsllw $15, %zmm0, %zmm0
+; VL_BW_DQ-NEXT:    vpsraw $15, %zmm0, %zmm0
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
 ; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
@@ -251,11 +236,7 @@
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -263,12 +244,9 @@
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
-; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX512VL-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -519,10 +497,9 @@
 define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
@@ -534,13 +511,12 @@
 ;
 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
-; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpsrad $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512VL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
@@ -549,9 +525,9 @@
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
 ; VL_BW_DQ:       # %bb.0:
-; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
-; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; VL_BW_DQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; VL_BW_DQ-NEXT:    vpsrad $31, %ymm0, %ymm0
 ; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; VL_BW_DQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]