Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -15048,20 +15048,49 @@
   if (Zeroable.isAllOnesValue())
     return getZeroVector(VT, Subtarget, DAG, DL);

+  bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
+
+  // Create an alternative mask with info about zeroable elements.
+  // Here we do not set undef elements as zeroable.
+  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+  if (V2IsZero) {
+    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+    for (int i = 0; i != NumElements; ++i)
+      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+        ZeroableMask[i] = SM_SentinelZero;
+  }
+
   // Try to collapse shuffles into using a vector type with fewer elements but
   // wider element types. We cap this to not form integers or floating point
   // elements wider than 64 bits, but it might be interesting to form i128
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
-      canWidenShuffleElements(Mask, WidenedMask)) {
+      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
     MVT NewEltVT = VT.isFloatingPoint()
                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
-    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
+    int NewNumElts = NumElements / 2;
+    MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
     // Make sure that the new vector type is legal. For example, v2f64 isn't
     // legal on SSE1.
     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
+      if (V2IsZero) {
+        // Modify the new Mask to take all zeros from the all-zero vector.
+        // Choose indices that are blend-friendly.
+        bool UsedZeroVector = false;
+        assert(find(WidenedMask, SM_SentinelZero) != WidenedMask.end() &&
+               "V2's non-undef elements are used?!");
+        for (int i = 0; i != NewNumElts; ++i)
+          if (WidenedMask[i] == SM_SentinelZero) {
+            WidenedMask[i] = i + NewNumElts;
+            UsedZeroVector = true;
+          }
+        // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
+        // some elements to be undef.
+        if (UsedZeroVector)
+          V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
+      }
       V1 = DAG.getBitcast(NewVT, V1);
       V2 = DAG.getBitcast(NewVT, V2);
       return DAG.getBitcast(
Index: llvm/trunk/test/CodeGen/X86/avx-cast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-cast.ll
+++ llvm/trunk/test/CodeGen/X86/avx-cast.ll
@@ -9,9 +9,7 @@
 define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
 ; AVX-LABEL: castA:
 ; AVX:       ## %bb.0:
-; AVX-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT:    vmovaps %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %shuffle.i
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2890,31 +2890,15 @@
 ; AVX1-LABEL: zeroable_src_to_zext:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5],zero,zero,zero,zero,zero,zero,xmm0[6,7],zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: zeroable_src_to_zext:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: zeroable_src_to_zext:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: zeroable_src_to_zext:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,33,0,0,0,0,0,0,34,35,0,0,0,0,0,0,36,37,16,16,16,16,16,16,38,39,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLVBMI-NEXT:    vpermt2b %ymm0, %ymm2, %ymm1
-; AVX512VLVBMI-NEXT:    vmovdqa %ymm1, %ymm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX2OR512VL-LABEL: zeroable_src_to_zext:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2OR512VL-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32>
   %2 = shufflevector <32 x i8> %1, <32 x i8> , <32 x i32>
   ret <32 x i8> %2
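
Note: the standalone C++ sketch below is only an illustration of the mask transformation this patch performs; it does not use LLVM's APIs. SentinelUndef/SentinelZero and widenMask are simplified stand-ins (assumptions) for SM_SentinelUndef/SM_SentinelZero and canWidenShuffleElements, which handles more pair combinations than this toy version.

// Illustrative only: widen a narrow shuffle mask whose zeroable lanes have
// been marked, then redirect the widened zero lanes at an explicit zero
// vector using blend-friendly indices, mirroring the loop added above.
#include <cstdio>
#include <vector>

const int SentinelUndef = -1; // lane value does not matter
const int SentinelZero  = -2; // lane must be zero

// Simplified widening: each adjacent pair of narrow lanes must either be a
// naturally aligned pair of consecutive source lanes, or contain only
// zero/undef sentinels.
bool widenMask(const std::vector<int> &Mask, std::vector<int> &Widened) {
  Widened.clear();
  for (int i = 0, e = (int)Mask.size(); i + 1 < e; i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    bool M0Sent = M0 < 0, M1Sent = M1 < 0;
    if (M0Sent && M1Sent) {
      // Spread zeroing across the whole wide lane if either half needs it.
      Widened.push_back(M0 == SentinelZero || M1 == SentinelZero
                            ? SentinelZero : SentinelUndef);
    } else if (!M0Sent && (M0 % 2) == 0 && M1 == M0 + 1) {
      Widened.push_back(M0 / 2); // aligned pair collapses to one wide index
    } else {
      return false; // cannot express this pair as a single wide lane
    }
  }
  return true;
}

int main() {
  // A v8i16-style mask <0,1,Z,Z,2,3,Z,Z>: the 2nd and 4th i32-sized pairs
  // are known zero (e.g. because the second shuffle operand is all zeros).
  std::vector<int> Mask = {0, 1, SentinelZero, SentinelZero,
                           2, 3, SentinelZero, SentinelZero};
  std::vector<int> Widened;
  if (!widenMask(Mask, Widened))
    return 1;

  // As in the patch: send each widened zero lane to the matching lane of an
  // explicit zero vector used as the second operand.
  int NewNumElts = (int)Widened.size();
  for (int i = 0; i != NewNumElts; ++i)
    if (Widened[i] == SentinelZero)
      Widened[i] = i + NewNumElts;

  for (int M : Widened)
    std::printf("%d ", M); // expected output: 0 5 1 7
  std::printf("\n");
  return 0;
}

Because every widened zero lane is rewritten to i + NewNumElts, it takes the same-numbered lane of the appended zero vector, which is what the "blend-friendly" comment refers to; that is why the updated tests collapse to a plain vmovaps/vpmovzxwq instead of a separate xor plus blend.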