diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21255,14 +21255,17 @@
   // Merge shuffles through binops if we are able to merge it with at least
   // one other shuffles.
+  // shuffle(bop(shuffle(x,y),shuffle(z,w)),undef)
   // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
   unsigned SrcOpcode = N0.getOpcode();
-  if (SrcOpcode == N1.getOpcode() && TLI.isBinOp(SrcOpcode) &&
-      N->isOnlyUserOf(N0.getNode()) && N->isOnlyUserOf(N1.getNode())) {
+  if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) &&
+      (N1.isUndef() ||
+       (SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) {
+    // Get binop source ops, or just pass on the undef.
     SDValue Op00 = N0.getOperand(0);
-    SDValue Op10 = N1.getOperand(0);
     SDValue Op01 = N0.getOperand(1);
-    SDValue Op11 = N1.getOperand(1);
+    SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0);
+    SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1);
     // TODO: We might be able to relax the VT check but we don't currently
     // have any isBinOp() that has different result/ops VTs so play safe until
     // we have test coverage.
diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll
--- a/llvm/test/CodeGen/X86/haddsub-4.ll
+++ b/llvm/test/CodeGen/X86/haddsub-4.ll
@@ -123,26 +123,25 @@
 define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) {
 ; SSE-LABEL: hadd_reverse3_v8f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: haddps %xmm2, %xmm4
-; SSE-NEXT: haddps %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: movaps %xmm4, %xmm1
+; SSE-NEXT: haddps %xmm1, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
+; SSE-NEXT: haddps %xmm0, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
+; SSE-NEXT: movaps %xmm3, %xmm0
+; SSE-NEXT: movaps %xmm2, %xmm1
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: hadd_reverse3_v8f32:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: hadd_reverse3_v8f32:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
 ; AVX2-NEXT: retq
   %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32>
diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll
--- a/llvm/test/CodeGen/X86/haddsub-shuf.ll
+++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll
@@ -525,7 +525,6 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: hadd_v8i32b:
@@ -615,7 +614,6 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: hsub_v8i32b:
@@ -705,7 +703,6 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: hadd_v16i16b:
@@ -795,7 +792,6 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: hsub_v16i16b:
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -513,9 +513,8 @@
 ; X86: # %bb.0:
 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-NEXT: retl
 ;
@@ -523,9 +522,8 @@
 ; X64-AVX1: # %bb.0:
 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT: retq
 ;
@@ -553,9 +551,8 @@
 ; X86: # %bb.0:
 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-NEXT: retl
 ;
@@ -563,9 +560,8 @@
 ; X64-AVX1: # %bb.0:
 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT: retq
 ;
@@ -593,9 +589,8 @@
 ; X86: # %bb.0:
 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-NEXT: retl
 ;
@@ -603,9 +598,8 @@
 ; X64-AVX1: # %bb.0:
 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT: retq
 ;
@@ -633,9 +627,8 @@
 ; X86: # %bb.0:
 ; X86-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-NEXT: retl
 ;
@@ -643,9 +636,8 @@
 ; X64-AVX1: # %bb.0:
 ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0
 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX1-NEXT: retq
 ;
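For reference, the new `N1.isUndef()` clause fires on DAGs whose IR shape is roughly the following, where the outer shufflevector's second operand is undef so the binop operands can be fed through as-is. This is a hypothetical reduced example, not one of the patch's test cases; the function name and shuffle masks are illustrative only.

define <4 x float> @shuffle_of_binop_undef(<4 x float> %x, <4 x float> %y) {
  ; inner shuffles feeding the binop: shuffle(x,y) and shuffle(x,y)
  %s0 = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %s1 = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %bop = fadd <4 x float> %s0, %s1
  ; outer shuffle with an undef second operand: shuffle(bop(...), undef)
  %r = shufflevector <4 x float> %bop, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %r
}

Previously the combine required both operands of the outer shuffle to be matching binops; with this change the undef operand is simply passed through as Op10/Op11, which is what lets the redundant vpshufd/vmovddup duplications in the updated tests fold away.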