Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7606,6 +7606,77 @@
   return SDValue();
 }
 
+static SDValue getCompressWithConstantMask(SDLoc DL, MVT VT, SelectionDAG &DAG,
+                                           unsigned MaskVal, SDValue Vec,
+                                           SDValue BlendVec = SDValue()) {
+  if (!BlendVec)
+    BlendVec = DAG.getUNDEF(VT);
+  SDValue K = DAG.getConstant(MaskVal, DL,
+                              MVT::getIntegerVT(VT.getVectorNumElements()));
+  K = DAG.getBitcast(MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()), K);
+
+  SDValue Compress =
+      DAG.getNode(X86ISD::COMPRESS, DL, VT, Vec);
+  Compress = DAG.getSelect(DL, VT, K, Compress, BlendVec);
+  return Compress;
+}
+
+/// Tries to match a COMPRESS node from a BUILD_VECTOR.
+static SDValue lowerBuildVectorAsCompress(BuildVectorSDNode *BV,
+                                          SelectionDAG &DAG) {
+  SDLoc DL(BV);
+  MVT VT = BV->getSimpleValueType(0);
+
+  // If the input is something other than an EXTRACT_VECTOR_ELT with a constant
+  // index, bail out.
+  // TODO: Allow undef elements in some cases?
+  // TODO: Support zeros with zero-masking.
+  if (any_of(BV->ops(), [VT](SDValue Op) {
+        return Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+               !isa<ConstantSDNode>(Op.getOperand(1)) ||
+               Op.getValueType() != VT.getVectorElementType();
+      }))
+    return SDValue();
+
+  // Helper for obtaining an EXTRACT_VECTOR_ELT's constant index.
+  auto GetExtractIdx = [](SDValue Extract) {
+    return cast<ConstantSDNode>(Extract.getOperand(1))->getSExtValue();
+  };
+
+  SDValue ExtractedFromVec = BV->getOperand(0).getOperand(0);
+  MVT SrcVT = ExtractedFromVec.getSimpleValueType();
+  // COMPRESS supports 32-bit and 64-bit vector elements only.
+  if (SrcVT.getScalarSizeInBits() != 32 && SrcVT.getScalarSizeInBits() != 64)
+    return SDValue();
+
+  // All extractelt operands must be from the same vector source.
+  if (any_of(BV->ops(), [ExtractedFromVec](SDValue Op) {
+        return Op.getOperand(0) != ExtractedFromVec;
+      }))
+    return SDValue();
+
+  assert(SrcVT.getSizeInBits() > VT.getSizeInBits() &&
+         "Why wasn't this BUILD_VECTOR lowered as a shuffle?");
+
+  // The extractelt indices must be strictly increasing.
+  if (!std::is_sorted(BV->op_begin(), BV->op_end(),
+                      [GetExtractIdx](SDValue L, SDValue R) {
+                        return GetExtractIdx(L) <= GetExtractIdx(R);
+                      })) {
+    return SDValue();
+  }
+
+  // Construct the compression mask from the extract indices.
+  unsigned Mask = 0;
+  for (SDValue Op : BV->ops())
+    Mask |= 1 << GetExtractIdx(Op);
+
+  SDValue Compress =
+      getCompressWithConstantMask(DL, SrcVT, DAG, Mask, ExtractedFromVec);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Compress,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -7630,6 +7701,9 @@
     return Broadcast;
   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
     return BitOp;
+  if (Subtarget.hasAVX512())
+    if (SDValue Compress = lowerBuildVectorAsCompress(BV, DAG))
+      return Compress;
 
   unsigned EVTBits = ExtVT.getSizeInBits();
 
@@ -11339,6 +11413,55 @@
   return DAG.getBitcast(VT, V);
 }
 
+/// Tries to match a COMPRESS node from a vector shuffle.
+static SDValue lowerShuffleVectorAsCompress(SDLoc DL, MVT VT, SDValue V1,
+                                            SDValue V2, ArrayRef<int> Indices,
+                                            SelectionDAG &DAG) {
+
+  // Perform substitution of a shuffle of two vectors originating from the same
+  // large vector source. Example:
+  // (shuffle (extract_subvector V, 0), (extract_subvector V, 4) <1, 3, 5, 7>)
+  //   -->
+  // COMPRESS V, b01010101
+  unsigned NumElems = VT.getVectorNumElements();
+  assert((VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v4f32 ||
+          VT == MVT::v8f32 || VT == MVT::v4i64 || VT == MVT::v4f64) &&
+         "Unsupported shuffle type");
+  assert(NumElems == Indices.size() &&
+         "Inconsistent shuffle type and mask indices");
+
+  if (V2.isUndef() || V1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      V2.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      V1.getOperand(0) != V2.getOperand(0) ||
+      V1.getOperand(0).getValueType().getVectorNumElements() != NumElems * 2)
+    return SDValue();
+
+  // Conservatively bail out if there are any undef mask elements.
+  // TODO: Handle cases where it is profitable to select COMPRESS in the
+  // presence of undef mask indices.
+  if (find(Indices, -1) != Indices.end())
+    return SDValue();
+
+  // The shuffle mask indices must be strictly increasing.
+  if (!std::is_sorted(Indices.begin(), Indices.end(),
+                      [](int L, int R) { return L <= R; })) {
+    return SDValue();
+  }
+
+  SDValue ExtractedFromVec = V1.getOperand(0);
+  MVT SrcVT = ExtractedFromVec.getSimpleValueType();
+
+  // Construct the bitmask pattern from the shuffle indices.
+  unsigned Mask = 0;
+  for (int Idx : Indices)
+    Mask |= 1 << Idx;
+
+  SDValue Res =
+      getCompressWithConstantMask(DL, SrcVT, DAG, Mask, ExtractedFromVec);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
 /// \brief Generic lowering of 8-lane i16 shuffles.
 ///
 /// This handles both single-input shuffles and combined shuffle/blends with
@@ -12666,6 +12789,11 @@
     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
                                                    DAG);
   }
+  if (Subtarget.hasAVX512())
+    if (SDValue V =
+            lowerShuffleVectorAsCompress(DL, MVT::v4f64, V1, V2, Mask, DAG))
+      return V;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
@@ -12780,6 +12908,11 @@
                                                       Mask, Subtarget, DAG))
       return Rotate;
 
+  if (Subtarget.hasAVX512())
+    if (SDValue V =
+            lowerShuffleVectorAsCompress(DL, MVT::v4i64, V1, V2, Mask, DAG))
+      return V;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
@@ -12848,6 +12981,10 @@
     // have already handled any direct blends.
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
   }
+  if (Subtarget.hasAVX512())
+    if (SDValue V =
+            lowerShuffleVectorAsCompress(DL, MVT::v8f32, V1, V2, Mask, DAG))
+      return V;
 
   // Try to create an in-lane repeating shuffle mask and then shuffle the
   // the results into the target lanes.
@@ -12972,6 +13109,11 @@
       return V;
   }
 
+  if (Subtarget.hasAVX512())
+    if (SDValue V =
+            lowerShuffleVectorAsCompress(DL, MVT::v8i32, V1, V2, Mask, DAG))
+      return V;
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
Index: test/CodeGen/X86/pmul.ll
===================================================================
--- test/CodeGen/X86/pmul.ll
+++ test/CodeGen/X86/pmul.ll
@@ -1364,15 +1364,27 @@
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: retq
 ;
-; AVX512-LABEL: mul_v8i64_zero_upper:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
-; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: mul_v8i64_zero_upper:
+; AVX512F: # BB#0: # %entry
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: mul_v8i64_zero_upper:
+; AVX512BW: # BB#0: # %entry
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
+; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
 entry:
   %val1a = zext <8 x i32> %val1 to <8 x i64>
   %val2a = zext <8 x i32> %val2 to <8 x i64>
Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll
===================================================================
--- test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -175,15 +175,45 @@
 }
 
 define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512F-NEXT: movw $21845, %ax # imm = 0x5555
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: vextracti64x4 $0, %zmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512VL-NEXT: movw $21845, %ax # imm = 0x5555
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512BW-NEXT: movw $21845, %ax # imm = 0x5555
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: vextracti64x4 $0, %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512BWVL: # BB#0:
+; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0
+; AVX512BWVL-NEXT: movw $21845, %ax # imm = 0x5555
+; AVX512BWVL-NEXT: kmovd %eax, %k1
+; AVX512BWVL-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
   %vec = load <16 x i32>, <16 x i32>* %L
   %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
   store <8 x i32> %strided.vec, <8 x i32>* %S
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -281,29 +281,44 @@
 
 ;FIXME: can do better with vpcompress
 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; ALL: # BB#0:
-; ALL-NEXT: vextracti32x8 $1, %zmm0, %ymm1
-; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ALL-NEXT: retq
+; AVX512F-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
   ret <8 x i32> %res
 }
 
 ;FIXME: can do better with vpcompress
 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
-; ALL-LABEL: test_v16i32_0_1_2_12:
-; ALL: # BB#0:
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX512F-LABEL: test_v16i32_0_1_2_12:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: movw $4103, %ax # imm = 0x1007
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v16i32_0_1_2_12:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: movw $4103, %ax # imm = 0x1007
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
   ret <4 x i32> %res
 }
@@ -321,28 +336,44 @@
 
 ;FIXME: can do better with vcompressp
 define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
-; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
-; ALL: # BB#0:
-; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm1
-; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]
-; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
-; ALL-NEXT: retq
+; AVX512F-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: movw $1247, %ax # imm = 0x4DF
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vcompressps %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: movw $1247, %ax # imm = 0x4DF
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vcompressps %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0
+; AVX512BW-NEXT: retq
   %res = shufflevector <16 x float> %v, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 10>
   ret <8 x float> %res
 }
 
 ;FIXME: can do better with vcompressp
 define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
-; ALL-LABEL: test_v16f32_0_1_3_6:
-; ALL: # BB#0:
-; ALL-NEXT: vextractf32x4 $1, %zmm0, %xmm1
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX512F-LABEL: test_v16f32_0_1_3_6:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: movw $75, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vcompressps %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: test_v16f32_0_1_3_6:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: movw $75, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vcompressps %zmm0, %zmm0 {%k1}
+; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
   %res = shufflevector <16 x float> %v, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 6>
   ret <4 x float> %res
 }
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2664,18 +2664,18 @@
 define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
 ; AVX512F-LABEL: test_v8f64_2346:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,2]
-; AVX512F-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX512F-NEXT: movb $92, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vcompresspd %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_v8f64_2346:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,2]
-; AVX512F-32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX512F-32-NEXT: movb $92, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vcompresspd %zmm0, %zmm0 {%k1}
+; AVX512F-32-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512F-32-NEXT: retl
   %res = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 6>
   ret <4 x double> %res
@@ -2685,17 +2685,19 @@
 define <2 x double> @test_v8f64_34 (<8 x double> %v) {
 ; AVX512F-LABEL: test_v8f64_34:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm0
-; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX512F-NEXT: movb $24, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vcompresspd %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_v8f64_34:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; AVX512F-32-NEXT: vextractf32x4 $1, %zmm0, %xmm0
-; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX512F-32-NEXT: movb $24, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vcompresspd %zmm0, %zmm0 {%k1}
+; AVX512F-32-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512F-32-NEXT: vzeroupper
 ; AVX512F-32-NEXT: retl
   %res = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 3, i32 4>
@@ -2706,18 +2708,18 @@
 define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
 ; AVX512F-LABEL: test_v8i64_1257:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,3]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: movb $-90, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_v8i64_1257:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512F-32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,3]
-; AVX512F-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-32-NEXT: movb $-90, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vpcompressq %zmm0, %zmm0 {%k1}
+; AVX512F-32-NEXT: # kill: %YMM0 %YMM0 %ZMM0
 ; AVX512F-32-NEXT: retl
   %res = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 7>
   ret <4 x i64> %res
@@ -2726,22 +2728,19 @@
 define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
 ; AVX512F-LABEL: test_v8i64_2_5:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: movb $36, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_v8i64_2_5:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti32x4 $1, %zmm0, %xmm1
-; AVX512F-32-NEXT: vpextrd $1, %xmm1, %eax
-; AVX512F-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-32-NEXT: vpextrd $2, %xmm0, %eax
-; AVX512F-32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX512F-32-NEXT: movw $3120, %ax # imm = 0xC30
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; AVX512F-32-NEXT: # kill: %XMM0 %XMM0 %ZMM0
 ; AVX512F-32-NEXT: vzeroupper
 ; AVX512F-32-NEXT: retl
   %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>