diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -44358,8 +44358,8 @@ /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, - const X86Subtarget &Subtarget, - bool IsCommutative) { + const X86Subtarget &Subtarget, bool IsCommutative, + SmallVectorImpl &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) return false; @@ -44452,6 +44452,10 @@ RMask.push_back(i); } + if (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) || + isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)) + return false; + // If A and B occur in reverse order in RHS, then canonicalize by commuting // RHS operands and shuffle mask. if (A != C) { @@ -44462,6 +44466,9 @@ if (!(A == C && B == D)) return false; + PostShuffleMask.clear(); + PostShuffleMask.append(NumElts, SM_SentinelUndef); + // LHS and RHS are now: // LHS = shuffle A, B, LMask // RHS = shuffle A, B, RMask @@ -44470,6 +44477,7 @@ // so we just repeat the inner loop if this is a 256-bit op. unsigned Num128BitChunks = VT.getSizeInBits() / 128; unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; + unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; assert((NumEltsPer128BitChunk % 2 == 0) && "Vector type should have an even number of elements in each lane"); for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { @@ -44481,25 +44489,40 @@ (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) continue; + // Check that successive odd/even elements are being operated on. If not, + // this is not a horizontal operation. + if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && + !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) + return false; + + // Compute the post-shuffle mask index based on where the element + // is stored in the HOP result, and where it needs to be moved to. + int Base = LIdx & ~1u; + int Index = ((Base % NumEltsPer128BitChunk) / 2) + + ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); + // The low half of the 128-bit result must choose from A. // The high half of the 128-bit result must choose from B, // unless B is undef. In that case, we are always choosing from A. - unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; - unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0; - - // Check that successive elements are being operated on. If not, this is - // not a horizontal operation. - int Index = 2 * (i % NumEltsPer64BitChunk) + NumElts * Src + j; - if (!(LIdx == Index && RIdx == Index + 1) && - !(IsCommutative && LIdx == Index + 1 && RIdx == Index)) - return false; + if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) + Index += NumEltsPer64BitChunk; + PostShuffleMask[i + j] = Index; } } LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. - if (!shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget)) + bool IsIdentityPostShuffle = + isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); + if (IsIdentityPostShuffle) + PostShuffleMask.clear(); + + // Assume a SingleSource HOP if we only shuffle one input and don't need to + // shuffle the result. + if (!shouldUseHorizontalOp(LHS == RHS && + (NumShuffles < 2 || !IsIdentityPostShuffle), + DAG, Subtarget)) return false; LHS = DAG.getBitcast(VT, LHS); @@ -44518,10 +44541,16 @@ assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode"); // Try to synthesize horizontal add/sub from adds/subs of shuffles. + SmallVector PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd)) - return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; + } return SDValue(); } @@ -47617,16 +47646,21 @@ return MAdd; // Try to synthesize horizontal adds from adds of shuffles. + SmallVector PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, true, PostShuffleMask)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HADDBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HADDBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into @@ -47801,16 +47835,21 @@ // Try to synthesize horizontal subs from subs of shuffles. EVT VT = N->getValueType(0); + SmallVector PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false)) { + isHorizontalBinOp(Op0, Op1, DAG, Subtarget, false, PostShuffleMask)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); }; - return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, - HSUBBuilder); + SDValue HorizBinOp = + SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HSUBBuilder); + if (!PostShuffleMask.empty()) + HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, + DAG.getUNDEF(VT), PostShuffleMask); + return HorizBinOp; } // Try to create PSUBUS if SUB's argument is max/min diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -17,22 +17,46 @@ ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; -; SSSE3-LABEL: pr26491: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSSE3-NEXT: addps %xmm0, %xmm1 -; SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSSE3-NEXT: addss %xmm1, %xmm0 -; SSSE3-NEXT: retq +; SSSE3-SLOW-LABEL: pr26491: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq ; -; AVX-LABEL: pr26491: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSSE3-FAST-LABEL: pr26491: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 +; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSSE3-FAST-NEXT: addss %xmm0, %xmm1 +; SSSE3-FAST-NEXT: movaps %xmm1, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: pr26491: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: pr26491: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-LABEL: pr26491: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> %2 = fadd <4 x float> %1, %a0 %3 = extractelement <4 x float> %2, i32 2 diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -879,77 +879,59 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_1: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_SLOW-NEXT: addps %xmm0, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] +; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_1: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,2] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] ; SSSE3_FAST-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSSE3_FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3_FAST-NEXT: addps %xmm0, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_1: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_1: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_1: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_1: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],zero,zero -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> @@ -964,78 +946,49 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) { ; SSSE3_SLOW-LABEL: PR34724_2: ; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm3 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[1,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[2,0] -; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSSE3_SLOW-NEXT: retq ; ; SSSE3_FAST-LABEL: PR34724_2: ; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm3 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[2,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: movaps %xmm1, %xmm2 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[3,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] -; SSSE3_FAST-NEXT: addps %xmm3, %xmm2 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 ; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0] -; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0] -; SSSE3_FAST-NEXT: movaps %xmm2, %xmm0 +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] +; SSSE3_FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; SSSE3_FAST-NEXT: retq ; ; AVX1_SLOW-LABEL: PR34724_2: ; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_SLOW-NEXT: retq ; ; AVX1_FAST-LABEL: PR34724_2: ; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX1_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX1_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX1_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1_FAST-NEXT: retq ; ; AVX2_SLOW-LABEL: PR34724_2: ; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] ; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_SLOW-NEXT: retq ; ; AVX2_FAST-LABEL: PR34724_2: ; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm0[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] -; AVX2_FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero -; AVX2_FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1 -; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2_FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,3] +; AVX2_FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX2_FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -831,20 +831,30 @@ } define <4 x float> @PR45747_1(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_1: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE-NEXT: addps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_1: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_1: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_1: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %a %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32> @@ -852,19 +862,32 @@ } define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { -; SSE-LABEL: PR45747_2: -; SSE: # %bb.0: -; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: retq +; SSE-SLOW-LABEL: PR45747_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-SLOW-NEXT: retq ; -; AVX-LABEL: PR45747_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] -; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: retq +; SSE-FAST-LABEL: PR45747_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movaps %xmm1, %xmm0 +; SSE-FAST-NEXT: haddps %xmm1, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: PR45747_2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: PR45747_2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-FAST-NEXT: retq %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> %t1 = fadd <4 x float> %t0, %b %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32>