Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -834,6 +834,13 @@ return false; } +// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with +// undef's. +static bool isAnyConstantBuildVector(const SDNode *N) { + return ISD::isBuildVectorOfConstantSDNodes(N) || + ISD::isBuildVectorOfConstantFPSDNodes(N); +} + SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1) { EVT VT = N0.getValueType(); @@ -13747,6 +13754,71 @@ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } +// Attempt to combine a shuffle of 2 inputs of 'scalar sources' - +// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. +// This combine is done in the following cases: +// 1. Both N0,N1 are BUILD_VECTOR's composed of constants or undefs. +// 2. Only one of N0,N1 is a BUILD_VECTOR composed of constants or undefs - +// Combine iff that node is ALL_ZEROS. We prefer not to combine a +// BUILD_VECTOR of all constants to allow efficient materialization of +// constant vectors, but the ALL_ZEROS is an exception because +// zero-extension matching seems to rely on having BUILD_VECTOR nodes with +// zero padding between elements. FIXME: Eliminate this exception for +// ALL_ZEROS constant vectors. +// 3. Neither N0 nor N1 is composed of only constants. +static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI) { + EVT VT = SVN->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + SDValue N0 = SVN->getOperand(0); + SDValue N1 = SVN->getOperand(1); + + if (!N0->hasOneUse() || !N1->hasOneUse()) + return SDValue(); + // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as + // discussed above. 
+ if (!N1.isUndef()) { + bool N0AnyConst = isAnyConstantBuildVector(N0.getNode()); + bool N1AnyConst = isAnyConstantBuildVector(N1.getNode()); + if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode())) + return SDValue(); + if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode())) + return SDValue(); + } + + SmallVector<SDValue, 8> Ops; + for (int M : SVN->getMask()) { + SDValue Op = DAG.getUNDEF(VT.getScalarType()); + if (M >= 0) { + int Idx = M < (int)NumElts ? M : M - NumElts; + SDValue &S = (M < (int)NumElts ? N0 : N1); + if (S.getOpcode() == ISD::BUILD_VECTOR) { + Op = S.getOperand(Idx); + } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { + if (Idx == 0) + Op = S.getOperand(0); + } else { + // Operand can't be combined - bail out. + return SDValue(); + } + } + Ops.push_back(Op); + } + // BUILD_VECTOR requires all inputs to be of the same type, find the + // maximum type and extend them all. + EVT SVT = VT.getScalarType(); + if (SVT.isInteger()) + for (SDValue &Op : Ops) + SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); + if (SVT != VT.getScalarType()) + for (SDValue &Op : Ops) + Op = TLI.isZExtFree(Op.getValueType(), SVT) + ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) + : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT); + return DAG.getBuildVector(VT, SDLoc(SVN), Ops); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -13862,40 +13934,9 @@ // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. - if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { - SmallVector<SDValue, 8> Ops; - for (int M : SVN->getMask()) { - SDValue Op = DAG.getUNDEF(VT.getScalarType()); - if (M >= 0) { - int Idx = M % NumElts; - SDValue &S = (M < (int)NumElts ? 
N0 : N1); - if (S.getOpcode() == ISD::BUILD_VECTOR && S.hasOneUse()) { - Op = S.getOperand(Idx); - } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR && S.hasOneUse()) { - if (Idx == 0) - Op = S.getOperand(0); - } else { - // Operand can't be combined - bail out. - break; - } - } - Ops.push_back(Op); - } - if (Ops.size() == VT.getVectorNumElements()) { - // BUILD_VECTOR requires all inputs to be of the same type, find the - // maximum type and extend them all. - EVT SVT = VT.getScalarType(); - if (SVT.isInteger()) - for (SDValue &Op : Ops) - SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); - if (SVT != VT.getScalarType()) - for (SDValue &Op : Ops) - Op = TLI.isZExtFree(Op.getValueType(), SVT) - ? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT) - : DAG.getSExtOrTrunc(Op, SDLoc(N), SVT); - return DAG.getBuildVector(VT, SDLoc(N), Ops); - } - } + if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) + if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI)) + return Res; // If this shuffle only has a single input that is a bitcasted shuffle, // attempt to merge the 2 shuffles and suitably bitcast the inputs/output Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -2839,36 +2839,26 @@ define <4 x float> @combine_constant_insertion_v4f32(float %f) { ; SSE2-LABEL: combine_constant_insertion_v4f32: ; SSE2: # BB#0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movaps {{.*#+}} xmm1 = +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; 
SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_constant_insertion_v4f32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movaps {{.*#+}} xmm1 = +; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_constant_insertion_v4f32: ; SSE41: # BB#0: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_constant_insertion_v4f32: ; AVX: # BB#0: -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] ; AVX-NEXT: retq %a0 = insertelement <4 x float> undef, float %f, i32 0 %ret = shufflevector <4 x float> %a0, <4 x float> , <4 x i32> @@ -2878,53 +2868,35 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { ; SSE2-LABEL: combine_constant_insertion_v4i32: ; SSE2: # BB#0: -; SSE2-NEXT: movl $30, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movl $4, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movl $5, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: movaps {{.*#+}} xmm0 = +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_constant_insertion_v4i32: ; SSSE3: # BB#0: -; SSSE3-NEXT: movl $30, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movl $4, %eax -; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movl $5, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: movd %edi, %xmm1 +; SSSE3-NEXT: movaps {{.*#+}} xmm0 = +; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_constant_insertion_v4i32: ; SSE41: # BB#0: ; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: movl $4, %eax -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movl $5, %eax -; SSE41-NEXT: pinsrd $2, %eax, %xmm0 -; SSE41-NEXT: movl $30, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: combine_constant_insertion_v4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: movl $4, %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl $5, %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; AVX-NEXT: movl $30, %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_constant_insertion_v4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_constant_insertion_v4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] +; AVX2-NEXT: retq %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 %ret = shufflevector <4 x i32> %a0, <4 x i32> , 
<4 x i32> ret <4 x i32> %ret