Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -14528,6 +14528,7 @@
   bool V2IsUndef = V2.isUndef();
   if (V1IsUndef && V2IsUndef)
     return DAG.getUNDEF(VT);
+  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
 
   // When we create a shuffle node we put the UNDEF node to second operand,
   // but in some cases the first operand may be transformed to UNDEF.
@@ -14561,20 +14562,46 @@
   if (Zeroable.isAllOnesValue())
     return getZeroVector(VT, Subtarget, DAG, DL);
 
+  // Create an alternative mask with info about zeroable elements.
+  // Here we do not set undef elements as zeroable.
+  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+  if (V2IsZero) {
+    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+    for (int i = 0, e = Mask.size(); i != e; ++i)
+      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+        ZeroableMask[i] = SM_SentinelZero;
+  }
+
   // Try to collapse shuffles into using a vector type with fewer elements but
   // wider element types. We cap this to not form integers or floating point
   // elements wider than 64 bits, but it might be interesting to form i128
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
-      canWidenShuffleElements(Mask, WidenedMask)) {
+      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
     MVT NewEltVT = VT.isFloatingPoint()
                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
-    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    int NewNumElts = VT.getVectorNumElements() / 2;
+    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
     // Make sure that the new vector type is legal. For example, v2f64 isn't
     // legal on SSE1.
     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      if (V2IsZero) {
+        // Construct a new constant vector with zeros in elements that will
+        // allow blending, place undefs in remaining elements.
+        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
+               "V2's non-undef elements are used?!");
+        APInt Undefs(NewNumElts, -1);
+        SmallVector<APInt, 16> ZeroVecVals(NewNumElts,
+                                           APInt(NewEltVT.getSizeInBits(), 0));
+        for (int i = 0; i != NewNumElts; ++i)
+          if (WidenedMask[i] == SM_SentinelZero) {
+            WidenedMask[i] = i > NewNumElts ? i : i + NewNumElts;
+            Undefs.clearBit(i);
+          }
+        V2 = getConstVector(ZeroVecVals, Undefs, NewVT, DAG, SDLoc(V2));
+      }
       V1 = DAG.getBitcast(NewVT, V1);
       V2 = DAG.getBitcast(NewVT, V2);
       return DAG.getBitcast(
Index: test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2852,31 +2852,15 @@
 ; AVX1-LABEL: zeroable_src_to_zext:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: zeroable_src_to_zext:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: zeroable_src_to_zext:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: zeroable_src_to_zext:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,33,0,0,0,0,0,0,34,35,0,0,0,0,0,0,36,37,16,16,16,16,16,16,38,39,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVBMI-NEXT:    vpermt2b %ymm0, %ymm2, %ymm1
-; AVX512VLVBMI-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX2OR512VL-LABEL: zeroable_src_to_zext:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2OR512VL-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32>
   %2 = shufflevector <32 x i8> %1, <32 x i8> , <32 x i32>
   ret <32 x i8> %2
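
Note on why the ZeroableMask matters: when V2 is an all-zeros build vector, the raw shuffle
mask typically picks scattered, non-adjacent elements of V2, which makes canWidenShuffleElements
bail out; rewriting those lanes as SM_SentinelZero lets each zero/undef pair collapse into one
wide zero lane, after which the widened mask can be lowered as a blend against a constant zero
vector (which is what the zeroable_src_to_zext test now exercises). The code below is a minimal
standalone sketch of that widening rule only, not the in-tree implementation; the sentinel
constants and the widenMask/tryWiden helpers are invented for illustration, and the pairing
rules are a simplified reading of canWidenShuffleElements.

// Sketch: widening a shuffle mask over 2N narrow lanes into N wide lanes,
// treating lanes known to be zero as a dedicated sentinel.
#include <cstdio>
#include <vector>

constexpr int SentinelUndef = -1; // lane value is irrelevant
constexpr int SentinelZero = -2;  // lane is known to be zero

// Try to express a 2N-element mask as an N-element mask over lanes twice as
// wide. Returns false if some pair of narrow lanes cannot be merged.
static bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == SentinelUndef && M1 == SentinelUndef)
      Widened.push_back(SentinelUndef);          // both halves undef
    else if ((M0 == SentinelZero || M0 == SentinelUndef) &&
             (M1 == SentinelZero || M1 == SentinelUndef))
      Widened.push_back(SentinelZero);           // whole wide lane is zero
    else if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1)
      Widened.push_back(M0 / 2);                 // adjacent, pair-aligned lanes
    else if (M0 == SentinelUndef && M1 >= 0 && (M1 % 2) == 1)
      Widened.push_back(M1 / 2);                 // undef low half
    else if (M1 == SentinelUndef && M0 >= 0 && (M0 % 2) == 0)
      Widened.push_back(M0 / 2);                 // undef high half
    else
      return false;                              // pair cannot be widened
  }
  return true;
}

static void tryWiden(const char *Name, const std::vector<int> &Mask) {
  std::vector<int> Widened;
  if (!widenMask(Mask, Widened)) {
    std::printf("%s: cannot widen\n", Name);
    return;
  }
  std::printf("%s:", Name);
  for (int M : Widened)
    std::printf(" %d", M);
  std::printf("\n");
}

int main() {
  // 8-lane shuffle of V1 (elements 0..7) with an all-zeros V2 (elements 8..15).
  // The raw mask references scattered V2 elements, so pairs like (9,11) are
  // not adjacent and widening fails.
  tryWiden("raw mask     ", {0, 1, 9, 11, 2, 3, 13, 15});
  // Marking the V2 lanes as zero sentinels (the patch's ZeroableMask) lets each
  // (zero, zero) pair collapse into one wide zero lane, so the mask widens.
  tryWiden("zeroable mask", {0, 1, SentinelZero, SentinelZero,
                             2, 3, SentinelZero, SentinelZero});
  return 0;
}

Running the sketch prints "cannot widen" for the raw mask and "0 -2 1 -2" for the zeroable
mask; the patch then replaces those -2 (zero) lanes with indices into the constant zero
operand it builds with getConstVector, producing a blend-friendly widened shuffle.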