// Patch overview (unified diff, two files). Text ahead of the first "Index:"
// header is commentary only.
//
// 1) lib/Target/X86/X86ISelLowering.cpp, hunk at line 27153, inside the
//    `if (Match)` branch:
//    - SrcVT is now built from an integer scalar type: MaskVT's own scalar type
//      when MaskVT.isInteger(), otherwise MVT::getIntegerVT(MaskEltSize).
//    - The choice between X86ISD::VZEXT and ISD::ZERO_EXTEND_VECTOR_INREG now
//      compares SrcVT and MaskVT by getSizeInBits() instead of by type
//      equality. As before, V1 is narrowed with extractSubVector only on the
//      VZEXT path.
//
// 2) test/CodeGen/X86/vector-shuffle-combining-avx2.ll: appends the PR34577
//    function with CHECK lines for the X32-AVX2, X32-AVX512, X64-AVX2 and
//    X64-AVX512 prefixes.
//
// NOTE(review): line breaks are collapsed in this copy, and the shufflevector /
// select mask operands plus several `vmovaps ... =` constants are empty --
// presumably lost during extraction. Confirm against the original review
// before applying this patch.
Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -27153,13 +27153,18 @@ } if (Match) { unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); - SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize); - if (SrcVT != MaskVT) + MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : + MVT::getIntegerVT(MaskEltSize); + SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); + + if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) { V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); + Shuffle = unsigned(X86ISD::VZEXT); + } else + Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); + + DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); - Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT) - : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); return true; } } Index: test/CodeGen/X86/vector-shuffle-combining-avx2.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -897,3 +897,65 @@ %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> ret <32 x i8> %3 } + +define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) #0 { +; X32-AVX2-LABEL: PR34577: +; X32-AVX2: # BB#0: # %entry +; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; X32-AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; X32-AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; X32-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; X32-AVX2-NEXT: retl +; +; X32-AVX512-LABEL: PR34577: +; X32-AVX512: # BB#0: # %entry +; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm3 = <1,u,u,u,2,u,5,0> +; 
X32-AVX512-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm3 = +; X32-AVX512-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; X32-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5],ymm0[6,7] +; X32-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX512-NEXT: movb $86, %al +; X32-AVX512-NEXT: kmovw %eax, %k1 +; X32-AVX512-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} +; X32-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; X32-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = +; X32-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; X32-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; X32-AVX512-NEXT: retl +; +; X64-AVX2-LABEL: PR34577: +; X64-AVX2: # BB#0: # %entry +; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; X64-AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; X64-AVX2-NEXT: retq +; +; X64-AVX512-LABEL: PR34577: +; X64-AVX512: # BB#0: # %entry +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm3 = <1,u,u,u,2,u,5,0> +; X64-AVX512-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm3 = +; X64-AVX512-NEXT: vpermps %ymm2, %ymm3, %ymm2 +; X64-AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5],ymm0[6,7] +; X64-AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX512-NEXT: movb $86, %al +; X64-AVX512-NEXT: kmovw %eax, %k1 +; X64-AVX512-NEXT: vblendmps %zmm0, %zmm2, %zmm0 {%k1} +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3] +; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = +; X64-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; X64-AVX512-NEXT: retq +entry: + %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> + %sel = 
select <8 x i1> , <8 x float> %shuf0, <8 x float> zeroinitializer + %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> + %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> + ret <8 x float> %shuf2 +}