Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14868,6 +14868,55 @@ return SDValue(); } +// Combine shuffles of splat-shuffles of the form: +// shuffle (shuffle V, undef, splat-mask), undef, M +// If splat-mask contains undef elements, we need to be careful about +// introducing undef's in the folded mask which are not the result of composing +// the masks of the shuffles. +static SDValue combineShuffleOfSplat(ArrayRef UserMask, + ShuffleVectorSDNode *Splat, + SelectionDAG &DAG) { + ArrayRef SplatMask = Splat->getMask(); + assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); + + // Prefer simplifying to the splat-shuffle, if possible. This is legal if + // every undef mask element in the splat-shuffle has a corresponding undef + // element in the user-shuffle's mask or if the composition of mask elements + // would result in undef. + // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): + // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] + // In this case it is not legal to simplify to the splat-shuffle because we + // may be exposing the users of the shuffle an undef element at index 1 + // which was not there before the combine. + // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] + // In this case the composition of masks yields SplatMask, so it's ok to + // simplify to the splat-shuffle. + // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] + // In this case the composed mask includes all undef elements of SplatMask + // and in addition sets element zero to undef. It is safe to simplify to + // the splat-shuffle. + auto CanSimplifyToExistingSplat = [](ArrayRef UserMask, + ArrayRef SplatMask) { + for (unsigned i = 0, e = UserMask.size(); i != e; ++i) + if (UserMask[i] != -1 && SplatMask[i] == -1 && + SplatMask[UserMask[i]] != -1) + return false; + return true; + }; + if (CanSimplifyToExistingSplat(UserMask, SplatMask)) + return SDValue(Splat, 0); + + // Create a new shuffle with a mask that is composed of the two shuffles' + // masks. + SmallVector NewMask; + for (int Idx : UserMask) + NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); + + return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), + Splat->getOperand(0), Splat->getOperand(1), + NewMask); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -14914,6 +14963,11 @@ return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } + // A shuffle of a single vector that is a splat can always be folded. + if (auto *N0Shuf = dyn_cast(N0)) + if (N1->isUndef() && N0Shuf->isSplat()) + return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); + // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { Index: llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ llvm/trunk/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -5,9 +5,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { ; AVX2-LABEL: foo2: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,1] -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vmovapd %xmm1, (%rdi) +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX2-NEXT: vmovapd %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %res1 = shufflevector<2 x double> %res, <2 x double> undef, <2 x i32> @@ -18,9 +17,8 @@ define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind { ; AVX2-LABEL: foo4: ; AVX2: # BB#0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,0,2,3] -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> @@ -32,10 +30,8 @@ ; AVX2-LABEL: foo8: ; AVX2: # BB#0: ; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <2,0,u,u,5,1,3,7> -; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> @@ -46,7 +42,7 @@ define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -66,7 +62,7 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -76,9 +72,10 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -89,9 +86,10 @@ define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask5: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32>