diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11325,17 +11325,15 @@
 //
 // But when avx512vl is available, one can just use a single vpmovdw
 // instruction.
-static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
-                                     MVT VT, SDValue V1, SDValue V2,
-                                     SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget) {
+// TODO: Merge with lowerShuffleAsVTRUNC.
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
+                                     SDValue V2, ArrayRef<int> Mask,
+                                     const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
   assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
-
-  if (Mask.size() != VT.getVectorNumElements())
-    return SDValue();
-
   bool SwappedOps = false;
+  // TODO: Convert to use Zeroable bitmask.
   if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
     if (!ISD::isBuildVectorAllZeros(V1.getNode()))
       return SDValue();
@@ -11378,6 +11376,73 @@
   return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
 }
 
+// Attempt to match binary shuffle patterns as a truncate.
+static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
+                                    SDValue V2, ArrayRef<int> Mask,
+                                    const APInt &Zeroable,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG) {
+  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+  if (!Subtarget.hasAVX512())
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned MaxScale = 64 / VT.getScalarSizeInBits();
+  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    // TODO: Support non-BWI VPMOVWB truncations?
+    unsigned SrcEltBits = EltSizeInBits * Scale;
+    if (SrcEltBits < 32 && !Subtarget.hasBWI())
+      continue;
+
+    // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
+    // Bail if the V2 elements are undef.
+    unsigned NumHalfSrcElts = NumElts / Scale;
+    unsigned NumSrcElts = 2 * NumHalfSrcElts;
+    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+        isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
+      continue;
+
+    // The elements beyond the truncation must be undef/zero.
+    unsigned UpperElts = NumElts - NumSrcElts;
+    if (UpperElts > 0 &&
+        !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+      continue;
+
+    // As we're using both sources then we need to concat them together
+    // and truncate from the 256-bit src.
+    MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
+    SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
+
+    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+    MVT SrcVT = MVT::getVectorVT(SrcSVT, 256 / SrcEltBits);
+    Src = DAG.getBitcast(SrcVT, Src);
+
+    if (SrcVT.getVectorNumElements() == NumElts)
+      return DAG.getNode(ISD::TRUNCATE, DL, VT, Src);
+
+    if (!Subtarget.hasVLX()) {
+      // Non-VLX targets must truncate from a 512-bit type, so we need to
+      // widen, truncate and then possibly extract the original 128-bit
+      // vector.
+      bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
+      Src = widenSubVector(Src, !UndefUppers, Subtarget, DAG, DL, 512);
+      unsigned NumWideSrcElts = Src.getValueType().getVectorNumElements();
+      if (NumWideSrcElts >= NumElts) {
+        // Widening means we can now use a regular TRUNCATE.
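+        // For example, a v16i8 result at Scale == 4 reaches this point as a
+        // v8i32 source; widening it to a 512-bit v16i32 lets the plain
+        // v16i32 -> v16i8 TRUNCATE below lower directly to VPMOVDB.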
+        MVT WideVT = MVT::getVectorVT(VT.getScalarType(), NumWideSrcElts);
+        SDValue WideRes = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Src);
+        if (!WideVT.is128BitVector())
+          WideRes = extract128BitVector(WideRes, 0, DAG, DL);
+        return WideRes;
+      }
+    }
+    return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+  }
+
+  return SDValue();
+}
+
 /// Check whether a compaction lowering can be done by dropping even
 /// elements and compute how many times even elements must be dropped.
 ///
@@ -14733,7 +14798,7 @@
 
   // Try to use lower using a truncation.
   if (SDValue V =
-          lowerShuffleWithVPMOV(DL, Mask, MVT::v8i16, V1, V2, DAG, Subtarget))
+          lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
     return V;
 
   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
@@ -14816,6 +14881,11 @@
                                             Subtarget))
     return V;
 
+  // Try to use lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
                                                 Subtarget, DAG))
@@ -14922,7 +14992,11 @@
 
   // Try to use lower using a truncation.
   if (SDValue V =
-          lowerShuffleWithVPMOV(DL, Mask, MVT::v16i8, V1, V2, DAG, Subtarget))
+          lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+    return V;
+
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
     return V;
 
   // See if we can use SSE4A Extraction / Insertion.
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -42,11 +42,10 @@
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT:    vpand 16(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT:    vpand (%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@@ -143,11 +142,10 @@
 ;
 ; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512F-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
@@ -159,11 +157,10 @@
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512BW-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
@@ -377,54 +374,42 @@
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-;
AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpmovdb %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vpmovdb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMIVL-NEXT: vpmovdb %xmm1, %xmm1 -; AVX512VBMIVL-NEXT: vpmovdb %xmm0, %xmm0 -; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 ; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -1081,49 +1066,42 @@ ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: 
vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u> -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u> -; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -1199,54 +1177,42 @@ ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: vmovd %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpmovqb %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 ; AVX512VL-NEXT: vmovd %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 
-; AVX512BWVL-NEXT: vpmovqb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 ; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMIVL-NEXT: vpmovqb %xmm1, %xmm1 -; AVX512VBMIVL-NEXT: vpmovqb %xmm0, %xmm0 -; AVX512VBMIVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0 ; AVX512VBMIVL-NEXT: vmovd %xmm0, (%rsi) +; AVX512VBMIVL-NEXT: vzeroupper ; AVX512VBMIVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -178,20 +178,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: @@ -211,20 +208,17 @@ ; ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: 
vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: @@ -244,20 +238,17 @@ ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: @@ -293,44 +284,43 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpunpckldq 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: @@ -346,13 +336,14 @@ ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] ; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512VBMI-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: @@ -386,20 +377,17 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v8i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: @@ -415,20 
+403,17 @@ ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: @@ -444,20 +429,17 @@ ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1 +; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq ; ; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -1581,10 +1581,11 @@ ; ; AVX512F-LABEL: trunc2x4i32_8i16: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: 
vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc2x4i32_8i16: @@ -1597,10 +1598,11 @@ ; ; AVX512BW-LABEL: trunc2x4i32_8i16: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2x4i32_8i16: @@ -1709,10 +1711,11 @@ ; ; AVX512BW-LABEL: trunc2x8i16_16i8: ; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: trunc2x8i16_16i8: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -383,33 +383,88 @@ } define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) { -; AVX-LABEL: interleaved_load_vf8_i8_stride4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: interleaved_load_vf8_i8_stride4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; 
AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: interleaved_load_vf8_i8_stride4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq +; +; AVX512-LABEL: interleaved_load_vf8_i8_stride4: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4 +; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm1, %xmm3, 
%xmm4 +; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16 %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> @@ -529,10 +584,8 @@ ; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX512-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-NEXT: vpmovdb %zmm5, %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6 @@ -762,85 +815,83 @@ ; ; AVX512-LABEL: interleaved_load_vf32_i8_stride4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm3 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13 -; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm5 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5 -; AVX512-NEXT: vpmovdb %zmm5, %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = -; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm0 -; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm6 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3 +; AVX512-NEXT: vpmovdb %zmm3, %xmm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa {{.*#+}} 
xmm3 = +; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm0 +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm6 -; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm7 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm0[6,7] +; AVX512-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm6 -; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm6 -; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm5 +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5 +; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4 +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm1 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm5 -; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm6 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4 +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm4 +; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4 +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = 
<3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 +; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm2 -; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm3 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3 -; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm5 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3 -; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm5 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6 -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm2 -; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0 ; AVX512-NEXT: vpcmpeqb %zmm0, %zmm1, %k1 ; AVX512-NEXT: kxnord %k1, %k0, %k0