diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37890,6 +37890,50 @@
     return SDValue();
   }
 
+  // Try to refine our inputs given our knowledge of the target shuffle mask.
+  for (auto I : enumerate(Ops)) {
+    int OpIdx = I.index();
+    SDValue &Op = I.value();
+
+    // What range of shuffle mask element values results in picking from Op?
+    int Lo = OpIdx * Mask.size();
+    int Hi = Lo + Mask.size();
+
+    // Which elements of Op do we demand?
+    SmallVector<int, 64> OpDemandedIdentityMask(Mask.size(), -1);
+    for (int MaskElt : Mask) {
+      if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
+        int OpEltIdx = MaskElt - Lo;
+        OpDemandedIdentityMask[OpEltIdx] = OpEltIdx;
+      }
+    }
+
+    unsigned NumOpElts = Op.getValueType().getVectorNumElements();
+
+    SmallVector<int, 64> ScaledOpDemandedIdentityMask;
+    bool Scaled = scaleShuffleElements(OpDemandedIdentityMask, NumOpElts,
+                                       ScaledOpDemandedIdentityMask);
+    (void)Scaled;
+    assert(Scaled &&
+           "We should always succeed in scaling the identity shuffle mask!");
+    assert(isSequentialOrUndefInRange(ScaledOpDemandedIdentityMask, 0,
+                                      NumOpElts, 0) &&
+           "Should still have an identity mask after scaling!");
+
+    // Transform the (scaled) identity shuffle mask into a demanded-elts mask.
+    APInt DemandedOpElts = APInt::getNullValue(NumOpElts);
+    for (int ScaledOpDemandedIdentityMaskElt : ScaledOpDemandedIdentityMask)
+      if (ScaledOpDemandedIdentityMaskElt >= 0)
+        DemandedOpElts.setBit(ScaledOpDemandedIdentityMaskElt);
+
+    // Can this operand be simplified any further, given its demanded elements?
+    if (SDValue NewOp =
+            DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
+                Op, DemandedOpElts, DAG))
+      Op = NewOp;
+  }
+  // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
+
   // Canonicalize the combined shuffle mask chain with horizontal ops.
   // NOTE: This will update the Ops and Mask.
   if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -382,9 +382,11 @@
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
 ; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: por %xmm4, %xmm1
 ; SSE2-NEXT: retq
@@ -402,9 +404,11 @@
 ; SSE3-NEXT: movdqa %xmm3, %xmm4
 ; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
 ; SSE3-NEXT: por %xmm4, %xmm0
-; SSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm5, %xmm1
 ; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
-; SSE3-NEXT: por %xmm3, %xmm1
+; SSE3-NEXT: pandn %xmm3, %xmm5
+; SSE3-NEXT: por %xmm5, %xmm1
 ; SSE3-NEXT: pand %xmm2, %xmm1
 ; SSE3-NEXT: por %xmm4, %xmm1
 ; SSE3-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2261,12 +2261,13 @@
 ;
 ; AVX1-LABEL: splat_v3i32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; AVX1-NEXT: movq (%rdi), %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: splat_v3i32:
@@ -2288,12 +2289,13 @@
 ;
 ; XOP-LABEL: splat_v3i32:
 ; XOP: # %bb.0:
-; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; XOP-NEXT: vpinsrd $2, 8(%rdi), %xmm0, %xmm1
-; XOP-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
+; XOP-NEXT: movq (%rdi), %rax
+; XOP-NEXT: vmovq %rax, %xmm0
+; XOP-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7]
+; XOP-NEXT: vmovd %eax, %xmm2
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7]
 ; XOP-NEXT: retq
 %1 = load <3 x i32>, <3 x i32>* %ptr, align 1
 %2 = shufflevector <3 x i32> %1, <3 x i32> undef, <16 x i32>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -494,7 +494,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,0,1,10,11,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -309,7 +309,7 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm15[2],ymm1[3,4],ymm15[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7] @@ -333,7 +333,7 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,1,2,1,6,5,6,5] @@ -453,7 +453,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,4,5,u,u,u,u,8,9,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9 ; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm2[2,3],ymm5[2,3] @@ -474,7 +474,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vinserti128 
$1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm11[1,1,1,1,4,5,6,7,9,9,9,9,12,13,14,15] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -495,16 +495,14 @@ ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] -; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] -; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3] -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0] -; X86-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3 -; X86-AVX512-NEXT: vmovapd %ymm3, (%edx) -; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,0,3,0,10,0,1,0] -; X86-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4 -; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx) +; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1] +; X86-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,2,0,8,0,1,0] +; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm0, %zmm4 +; X86-AVX512-NEXT: vmovapd %ymm4, (%edx) +; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,0,3,0,10,0,1,0] +; X86-AVX512-NEXT: vpermt2pd %zmm0, %zmm4, %zmm3 +; X86-AVX512-NEXT: vmovapd %ymm3, (%ecx) ; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,0,11,0,u,u,u,u> ; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 ; X86-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,0,8,0,9,0,3,0] @@ -562,16 +560,14 @@ ; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 -; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2] -; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1] -; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3] -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm5 = [0,3,8,1] -; X64-AVX512-NEXT: vpermt2pd %zmm2, %zmm5, %zmm3 -; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi) -; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = [0,3,10,1] -; X64-AVX512-NEXT: vpermt2pd %zmm0, %zmm3, %zmm4 -; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi) +; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[0,1] +; X64-AVX512-NEXT: vshufpd {{.*#+}} ymm3 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,8,1] +; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm0, %zmm4 +; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi) +; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm4 = [0,3,10,1] +; X64-AVX512-NEXT: vpermt2pd %zmm0, %zmm4, %zmm3 +; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi) ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm3 = <3,11,u,u> ; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3 ; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm0 = [2,8,9,3] diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -568,12 +568,13 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1,3] ; SSE2-NEXT: shufps 
{{.*#+}} xmm1 = xmm1[2,0],xmm2[1,1] -; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: simplify_select: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -717,93 +717,40 @@ ; ; AVX2-LABEL: interleaved_load_vf32_i8_stride4: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-NEXT: vmovdqa 112(%rdi), %xmm9 -; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm6 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm13 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm1 -; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-NEXT: vpshufb %xmm1, %xmm9, %xmm2 -; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm1 -; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm4 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3 -; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-NEXT: vpshufb %xmm1, %xmm9, %xmm2 -; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm12, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm4 -; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX2-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 +; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm1, %ymm4, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -814,74 +761,31 @@ ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512-NEXT: 
vmovdqa 80(%rdi), %xmm11 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm4 -; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX512-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2 -; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4 -; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm5 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4 -; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm3 -; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm2 -; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0 -; AVX512-NEXT: 
vpcmpeqb %zmm0, %zmm15, %k1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm6 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512-NEXT: vpermt2d %zmm5, %zmm7, %zmm6 +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6 +; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm8 +; AVX512-NEXT: vpermt2d %zmm6, %zmm7, %zmm8 +; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-NEXT: vpermt2d %zmm3, %zmm7, %zmm2 +; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpcmpeqb %zmm4, %zmm0, %k0 +; AVX512-NEXT: vpcmpeqb %zmm1, %zmm5, %k1 ; AVX512-NEXT: kxnord %k1, %k0, %k0 ; AVX512-NEXT: vpmovm2b %k0, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
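
Note on the X86ISelLowering.cpp change above: each value in the combined shuffle mask indexes into the concatenation of the shuffle's operands, so mask values in [OpIdx * Mask.size(), (OpIdx + 1) * Mask.size()) select from Ops[OpIdx]. The standalone sketch below is illustrative only and not part of the patch; the helper name computeDemandedPerOp is hypothetical, it has no LLVM dependencies, and it deliberately skips the scaleShuffleElements() rescaling step the patch performs when an operand's element count differs from the mask's granularity.

// Sketch: map a combined shuffle mask over concatenated operands to one
// demanded-elements bitmask per operand. A mask value of -1 is an undef lane.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<uint64_t> computeDemandedPerOp(const std::vector<int> &Mask,
                                                  int NumOps) {
  const int MaskSize = static_cast<int>(Mask.size());
  assert(MaskSize <= 64 && "bitmask sketch limited to 64 lanes per operand");
  std::vector<uint64_t> Demanded(NumOps, 0);
  for (int MaskElt : Mask) {
    if (MaskElt < 0)
      continue;                        // undef lane demands nothing
    int OpIdx = MaskElt / MaskSize;    // which operand this lane reads from
    int OpEltIdx = MaskElt % MaskSize; // which element of that operand
    Demanded[OpIdx] |= uint64_t(1) << OpEltIdx;
  }
  return Demanded;
}

int main() {
  // 4-lane shuffle of two 4-element operands: lanes read
  // Ops[0][2], undef, Ops[1][0], Ops[1][0].
  std::vector<int> Mask = {2, -1, 4, 4};
  std::vector<uint64_t> Demanded = computeDemandedPerOp(Mask, /*NumOps=*/2);
  std::printf("Op0 demanded: %#llx\n", (unsigned long long)Demanded[0]); // 0x4
  std::printf("Op1 demanded: %#llx\n", (unsigned long long)Demanded[1]); // 0x1
  return 0;
}

In the patch itself the same per-operand information is first built as an identity shuffle mask, rescaled to Op's own element count, and then handed to SimplifyMultipleUseDemandedVectorElts(), which may return a simpler node that replaces the operand in Ops; the test-expectation churn above (narrower constants, dropped loads, simpler shuffles) falls out of that simplification.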