Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17976,6 +17976,34 @@ Op1, Op0.getOperand(1), NewInsIndex); } +/// If we have a unary shuffle of a shuffle, see if it can be folded away +/// completely. This has the potential to lose undef knowledge because the first +/// shuffle may not have an undef mask element where the second one does. So +/// only call this after doing simplifications based on demanded elements. +static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) { + // shuf (shuf0 X, Y, Mask0), undef, Mask + auto *Shuf0 = dyn_cast(Shuf->getOperand(0)); + if (!Shuf0 || !Shuf->getOperand(1).isUndef()) + return SDValue(); + + ArrayRef Mask = Shuf->getMask(); + ArrayRef Mask0 = Shuf0->getMask(); + for (int i = 0, e = (int)Mask.size(); i != e; ++i) { + // Ignore undef elements. + if (Mask[i] == -1) + continue; + assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value"); + + // Is the element of the shuffle operand chosen by this shuffle the same as + // the element chosen by the shuffle operand itself? + if (Mask0[Mask[i]] != Mask0[i]) + return SDValue(); + } + // Every element of this shuffle is identical to the result of the previous + // shuffle, so we can replace this value. + return Shuf->getOperand(0); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -18086,6 +18114,11 @@ if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); + // This is intentionally placed after demanded elements simplification because + // it could eliminate knowledge of undef elements created by this shuffle. + if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN)) + return ShufOp; + // Match shuffles that can be converted to any_vector_extend_in_reg. if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations)) return V; Index: llvm/trunk/test/CodeGen/X86/vector-zext.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-zext.ll +++ llvm/trunk/test/CodeGen/X86/vector-zext.ll @@ -2617,32 +2617,23 @@ define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) { ; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs: ; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,0,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,5,7] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs: ; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,8,9,10,11,12,13,14,15] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs: @@ -2671,31 +2662,25 @@ ; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,5,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,5,5,4] -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSSE3: # %bb.0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,14,15,6,7,12,13,14,15] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: retq ; ; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef: