Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13734,6 +13734,13 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
 }
 
+// Checks for a BUILD_VECTOR composed of either all-undefs, or constants
+// possibly mixed with undefs.
+static bool isAnyConstantBuildVector(const SDNode *N) {
+  return N->isUndef() || ISD::isBuildVectorOfConstantSDNodes(N) ||
+         ISD::isBuildVectorOfConstantFPSDNodes(N);
+}
+
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -13849,14 +13856,33 @@
 
   // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
+  // This combine is done in the following cases:
+  // 1. Both N0 and N1 are BUILD_VECTORs composed of constants or undefs.
+  // 2. Only one of N0 and N1 is a BUILD_VECTOR composed of constants or
+  //    undefs; combine iff that node is ALL_ZEROS. We prefer not to combine
+  //    a BUILD_VECTOR of all constants to allow efficient materialization of
+  //    constant vectors, but ALL_ZEROS is an exception because zero-extension
+  //    matching seems to rely on having BUILD_VECTOR nodes with zero padding
+  //    between elements. FIXME: Eliminate this exception for ALL_ZEROS
+  //    constant vectors.
+  // 3. Neither N0 nor N1 is composed of only constants.
   if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
+    bool BothN0N1Const = isAnyConstantBuildVector(N0.getNode()) &&
+                         isAnyConstantBuildVector(N1.getNode());
+    SmallVector<bool, 2> IsConstNotAllZeroes = {
+        isAnyConstantBuildVector(N0.getNode()) &&
+            !ISD::isBuildVectorAllZeros(N0.getNode()),
+        isAnyConstantBuildVector(N1.getNode()) &&
+            !ISD::isBuildVectorAllZeros(N1.getNode())};
     SmallVector<SDValue, 8> Ops;
     for (int M : SVN->getMask()) {
       SDValue Op = DAG.getUNDEF(VT.getScalarType());
       if (M >= 0) {
         int Idx = M % NumElts;
         SDValue &S = (M < (int)NumElts ? N0 : N1);
-        if (S.getOpcode() == ISD::BUILD_VECTOR && S.hasOneUse()) {
+        bool IsUnsuitableConst = IsConstNotAllZeroes[M < (int)NumElts ? 0 : 1];
+        if (BothN0N1Const || (S.getOpcode() == ISD::BUILD_VECTOR &&
+                              S.hasOneUse() && !IsUnsuitableConst)) {
           Op = S.getOperand(Idx);
         } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR && S.hasOneUse()) {
           if (Idx == 0)
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2839,36 +2839,26 @@
 define <4 x float> @combine_constant_insertion_v4f32(float %f) {
 ; SSE2-LABEL: combine_constant_insertion_v4f32:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,30>
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_constant_insertion_v4f32:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,30>
+; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_constant_insertion_v4f32:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: combine_constant_insertion_v4f32:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
 ; AVX-NEXT:    retq
   %a0 = insertelement <4 x float> undef, float %f, i32 0
   %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 30.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -2878,53 +2868,35 @@
 define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
 ; SSE2-LABEL: combine_constant_insertion_v4i32:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movl $30, %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movl $4, %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movl $5, %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: combine_constant_insertion_v4i32:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movl $30, %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movl $4, %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movl $5, %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movd %edi, %xmm1
+; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
+; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: combine_constant_insertion_v4i32:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    movl $4, %eax
-; SSE41-NEXT:    pinsrd $1, %eax, %xmm0
-; SSE41-NEXT:    movl $5, %eax
-; SSE41-NEXT:    pinsrd $2, %eax, %xmm0
-; SSE41-NEXT:    movl $30, %eax
-; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: combine_constant_insertion_v4i32:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    movl $4, %eax
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $5, %eax
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; AVX-NEXT:    movl $30, %eax
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_constant_insertion_v4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovd %edi, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_constant_insertion_v4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; AVX2-NEXT:    retq
   %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
   %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
   ret <4 x i32> %ret
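
Note on the case analysis: the per-element decision encoded by BothN0N1Const and IsConstNotAllZeroes reduces to a small predicate over the two shuffle inputs. The standalone C++ sketch below restates cases 1-3; NodeInfo, shouldTakeScalarsFrom, and the driver are hypothetical names for illustration only, and the opcode and hasOneUse() checks that also guard cases 2 and 3 in the real condition are omitted. The patch itself queries SDNodes via isAnyConstantBuildVector() and ISD::isBuildVectorAllZeros().

// Minimal sketch of the combine decision; not the patch's actual code.
#include <cstdio>

// Hypothetical summary of the properties the patch queries on each input.
struct NodeInfo {
  bool IsConstBuildVector; // all-undefs, or constants possibly mixed with undefs
  bool IsAllZeros;         // BUILD_VECTOR of all zero elements
};

// May the combine pull scalar operands out of input Self, given the other
// shuffle input Other? Mirrors `BothN0N1Const || (... && !IsUnsuitableConst)`.
static bool shouldTakeScalarsFrom(const NodeInfo &Self, const NodeInfo &Other) {
  // Case 1: both inputs are constant BUILD_VECTORs - always combine.
  if (Self.IsConstBuildVector && Other.IsConstBuildVector)
    return true;
  // Case 2: Self is the only constant input - combine iff it is ALL_ZEROS,
  // preserving the zero padding that zero-extension matching relies on.
  if (Self.IsConstBuildVector)
    return Self.IsAllZeros;
  // Case 3: Self is not a constant vector - combining cannot lose a cheap
  // constant-pool materialization, so it is allowed.
  return true;
}

int main() {
  NodeInfo NonConst{false, false}, Consts{true, false}, Zeros{true, true};
  std::printf("case 1, const/const:    %d\n", shouldTakeScalarsFrom(Consts, Consts));   // 1
  std::printf("case 2, const/nonconst: %d\n", shouldTakeScalarsFrom(Consts, NonConst)); // 0
  std::printf("case 2, zeros/nonconst: %d\n", shouldTakeScalarsFrom(Zeros, NonConst));  // 1
  std::printf("case 3, nonconst/const: %d\n", shouldTakeScalarsFrom(NonConst, Consts)); // 1
}

The combine_constant_insertion_* tests above exercise case 2's restriction: only N1 is a constant BUILD_VECTOR and it is not all-zeros, so the combine no longer fires, the constant stays a single constant-pool load, and the insertion lowers to one blend instead of a chain of scalar materializations.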