Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -39240,7 +39240,8 @@
 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
 /// A horizontal-op B, for some already available A and B, and if so then LHS is
 /// set to A, RHS to B, and the routine returns 'true'.
-static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative,
+                              unsigned &NumShuffles) {
   // If either operand is undef, bail out. The binop should be simplified.
   if (LHS.isUndef() || RHS.isUndef())
     return false;
@@ -39292,12 +39293,14 @@
   SDValue A, B;
   SmallVector<int, 16> LMask;
   GetShuffle(LHS, A, B, LMask);
+  NumShuffles = (LMask.empty() ? 0 : 1);

   // Likewise, view RHS in the form
   //   RHS = VECTOR_SHUFFLE C, D, RMask
   SDValue C, D;
   SmallVector<int, 16> RMask;
   GetShuffle(RHS, C, D, RMask);
+  NumShuffles += (RMask.empty() ? 0 : 1);

   // At least one of the operands should be a vector shuffle.
   if (LMask.empty() && RMask.empty())
     return false;
@@ -39375,10 +39378,11 @@
   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
+  unsigned NumShuffles = 0;
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, IsFadd) &&
-      shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
+      isHorizontalBinOp(LHS, RHS, IsFadd, NumShuffles) &&
+      shouldUseHorizontalOp(LHS == RHS && NumShuffles < 2, DAG, Subtarget))
     return DAG.getNode(HorizOpcode, SDLoc(N), VT, DAG.getBitcast(VT, LHS),
                        DAG.getBitcast(VT, RHS));
@@ -42274,10 +42278,11 @@
     return MAdd;

   // Try to synthesize horizontal adds from adds of shuffles.
+  unsigned NumShuffles = 0;
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) &&
-      shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true, NumShuffles) &&
+      shouldUseHorizontalOp(Op0 == Op1 && NumShuffles < 2, DAG, Subtarget)) {
     auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
@@ -42406,11 +42411,12 @@
   }

   // Try to synthesize horizontal subs from subs of shuffles.
+  unsigned NumShuffles = 0;
   EVT VT = N->getValueType(0);
   if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 ||
        VT == MVT::v8i32) &&
-      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) &&
-      shouldUseHorizontalOp(Op0 == Op1, DAG, Subtarget)) {
+      Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false, NumShuffles) &&
+      shouldUseHorizontalOp(Op0 == Op1 && NumShuffles < 2, DAG, Subtarget)) {
     auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                           ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
Index: test/CodeGen/X86/avx2-phaddsub.ll
===================================================================
--- test/CodeGen/X86/avx2-phaddsub.ll
+++ test/CodeGen/X86/avx2-phaddsub.ll
@@ -69,29 +69,15 @@
 }

 define <8 x i32> @phaddd3(<8 x i32> %x) {
-; X32-SLOW-LABEL: phaddd3:
-; X32-SLOW: # %bb.0:
-; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; X32-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; X32-SLOW-NEXT: retl
-;
-; X32-FAST-LABEL: phaddd3:
-; X32-FAST: # %bb.0:
-; X32-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; X32-FAST-NEXT: retl
-;
-; X64-SLOW-LABEL: phaddd3:
-; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; X64-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; X64-SLOW-NEXT: retq
-;
-; X64-FAST-LABEL: phaddd3:
-; X64-FAST: # %bb.0:
-; X64-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; X64-FAST-NEXT: retq
+; X32-LABEL: phaddd3:
+; X32: # %bb.0:
+; X32-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: phaddd3:
+; X64: # %bb.0:
+; X64-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; X64-NEXT: retq
   %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32>
   %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32>
   %r = add <8 x i32> %a, %b
Index: test/CodeGen/X86/haddsub-shuf.ll
===================================================================
--- test/CodeGen/X86/haddsub-shuf.ll
+++ test/CodeGen/X86/haddsub-shuf.ll
@@ -10,45 +10,15 @@

 ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111

 define <4 x float> @hadd_v4f32(<4 x float> %a) {
-; SSSE3_SLOW-LABEL: hadd_v4f32:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hadd_v4f32:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hadd_v4f32:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v4f32:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hadd_v4f32:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hadd_v4f32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: haddps %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hadd_v4f32:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hadd_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32>
   %hop = fadd <2 x float> %a02, %a13
@@ -87,51 +57,16 @@
 }

 define <8 x float> @hadd_v8f32b(<8 x float> %a) {
-; SSSE3_SLOW-LABEL: hadd_v8f32b:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
-; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: addps %xmm2, %xmm0
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3_SLOW-NEXT: addps %xmm3, %xmm1
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hadd_v8f32b:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
-; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hadd_v8f32b:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX1_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v8f32b:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hadd_v8f32b:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hadd_v8f32b:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: haddps %xmm0, %xmm0
+; SSSE3-NEXT: haddps %xmm1, %xmm1
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hadd_v8f32b:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hadd_v8f32b:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32>
   %hop = fadd <8 x float> %a0, %a1
@@ -140,45 +75,15 @@
 }

 define <4 x float> @hsub_v4f32(<4 x float> %a) {
-; SSSE3_SLOW-LABEL: hsub_v4f32:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: subps %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hsub_v4f32:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hsub_v4f32:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX1_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v4f32:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hsub_v4f32:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX2_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
-; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hsub_v4f32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: hsubps %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hsub_v4f32:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hsub_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32>
   %hop = fsub <2 x float> %a02, %a13
@@ -217,51 +122,16 @@
 }

 define <8 x float> @hsub_v8f32b(<8 x float> %a) {
-; SSSE3_SLOW-LABEL: hsub_v8f32b:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3]
-; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3]
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: subps %xmm0, %xmm2
-; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3_SLOW-NEXT: subps %xmm1, %xmm3
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
-; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hsub_v8f32b:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
-; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm1
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hsub_v8f32b:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX1_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v8f32b:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hsub_v8f32b:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
-; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hsub_v8f32b:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: hsubps %xmm0, %xmm0
+; SSSE3-NEXT: hsubps %xmm1, %xmm1
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hsub_v8f32b:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hsub_v8f32b:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
   %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32>
   %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32>
   %hop = fsub <8 x float> %a0, %a1
@@ -536,44 +406,15 @@
 }

 define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
-; SSSE3_SLOW-LABEL: hadd_v4i32:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hadd_v4i32:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hadd_v4i32:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v4i32:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hadd_v4i32:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX2_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hadd_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hadd_v4i32:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hadd_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32>
   %hop = add <4 x i32> %a02, %a13
@@ -612,57 +453,25 @@
 }

 define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
-; SSSE3_SLOW-LABEL: hadd_v8i32b:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: paddd %xmm2, %xmm0
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3_SLOW-NEXT: paddd %xmm3, %xmm1
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hadd_v8i32b:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm1
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hadd_v8i32b:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1_SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v8i32b:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1
-; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_FAST-NEXT: retq
+; SSSE3-LABEL: hadd_v8i32b:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: phaddd %xmm1, %xmm1
+; SSSE3-NEXT: retq
 ;
-; AVX2_SLOW-LABEL: hadd_v8i32b:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2_SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2_SLOW-NEXT: retq
+; AVX1-LABEL: hadd_v8i32b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hadd_v8i32b:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX2-LABEL: hadd_v8i32b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32>
   %hop = add <8 x i32> %a0, %a1
@@ -671,44 +480,15 @@
 }

 define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
-; SSSE3_SLOW-LABEL: hsub_v4i32:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hsub_v4i32:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hsub_v4i32:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v4i32:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hsub_v4i32:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX2_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hsub_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hsub_v4i32:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hsub_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32>
   %hop = sub <4 x i32> %a02, %a13
@@ -747,57 +527,25 @@
 }

 define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
-; SSSE3_SLOW-LABEL: hsub_v8i32b:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm2
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
-; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm3
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hsub_v8i32b:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0
-; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm1
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hsub_v8i32b:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1_SLOW-NEXT: vpsubd %xmm2, %xmm3, %xmm2
-; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v8i32b:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm1
-; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_FAST-NEXT: retq
+; SSSE3-LABEL: hsub_v8i32b:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: phsubd %xmm1, %xmm1
+; SSSE3-NEXT: retq
 ;
-; AVX2_SLOW-LABEL: hsub_v8i32b:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2_SLOW-NEXT: vpsubd %ymm0, %ymm1, %ymm0
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2_SLOW-NEXT: retq
+; AVX1-LABEL: hsub_v8i32b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hsub_v8i32b:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX2-LABEL: hsub_v8i32b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
   %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32>
   %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32>
   %hop = sub <8 x i32> %a0, %a1
@@ -806,45 +554,15 @@
 }

 define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
-; SSSE3_SLOW-LABEL: hadd_v8i16:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3_SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hadd_v8i16:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hadd_v8i16:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v8i16:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hadd_v8i16:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX2_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hadd_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hadd_v8i16:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hadd_v8i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32>
   %hop = add <8 x i16> %a0246, %a1357
@@ -883,64 +601,25 @@
 }

 define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
-; SSSE3_SLOW-LABEL: hadd_v16i16b:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3
-; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4
-; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0
-; SSSE3_SLOW-NEXT: paddw %xmm3, %xmm0
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1
-; SSSE3_SLOW-NEXT: paddw %xmm4, %xmm1
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hadd_v16i16b:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0
-; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm1
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hadd_v16i16b:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0
-; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2
-; AVX1_SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hadd_v16i16b:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm1
-; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_FAST-NEXT: retq
+; SSSE3-LABEL: hadd_v16i16b:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: phaddw %xmm1, %xmm1
+; SSSE3-NEXT: retq
 ;
-; AVX2_SLOW-LABEL: hadd_v16i16b:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
-; AVX2_SLOW-NEXT: vpaddw %ymm0, %ymm1, %ymm0
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2_SLOW-NEXT: retq
+; AVX1-LABEL: hadd_v16i16b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hadd_v16i16b:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphaddw %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX2-LABEL: hadd_v16i16b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32>
   %hop = add <16 x i16> %a0, %a1
@@ -949,45 +628,15 @@
 }

 define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
-; SSSE3_SLOW-LABEL: hsub_v8i16:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm1
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hsub_v8i16:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hsub_v8i16:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v8i16:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: retq
-;
-; AVX2_SLOW-LABEL: hsub_v8i16:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX2_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0
-; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2_SLOW-NEXT: retq
+; SSSE3-LABEL: hsub_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubw %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hsub_v8i16:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
-; AVX2_FAST-NEXT: retq
+; AVX-LABEL: hsub_v8i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32>
   %hop = sub <8 x i16> %a0246, %a1357
@@ -1026,64 +675,25 @@
 }

 define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
-; SSSE3_SLOW-LABEL: hsub_v16i16b:
-; SSSE3_SLOW: # %bb.0:
-; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3
-; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4
-; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0
-; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm3
-; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1
-; SSSE3_SLOW-NEXT: psubw %xmm1, %xmm4
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1]
-; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
-; SSSE3_SLOW-NEXT: retq
-;
-; SSSE3_FAST-LABEL: hsub_v16i16b:
-; SSSE3_FAST: # %bb.0:
-; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0
-; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm1
-; SSSE3_FAST-NEXT: retq
-;
-; AVX1_SLOW-LABEL: hsub_v16i16b:
-; AVX1_SLOW: # %bb.0:
-; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
-; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm2, %xmm0
-; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2
-; AVX1_SLOW-NEXT: vpsubw %xmm2, %xmm1, %xmm1
-; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_SLOW-NEXT: retq
-;
-; AVX1_FAST-LABEL: hsub_v16i16b:
-; AVX1_FAST: # %bb.0:
-; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm1
-; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0
-; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; AVX1_FAST-NEXT: retq
+; SSSE3-LABEL: hsub_v16i16b:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubw %xmm0, %xmm0
+; SSSE3-NEXT: phsubw %xmm1, %xmm1
+; SSSE3-NEXT: retq
 ;
-; AVX2_SLOW-LABEL: hsub_v16i16b:
-; AVX2_SLOW: # %bb.0:
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
-; AVX2_SLOW-NEXT: vpsubw %ymm0, %ymm1, %ymm0
-; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2_SLOW-NEXT: retq
+; AVX1-LABEL: hsub_v16i16b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
 ;
-; AVX2_FAST-LABEL: hsub_v16i16b:
-; AVX2_FAST: # %bb.0:
-; AVX2_FAST-NEXT: vphsubw %ymm0, %ymm0, %ymm0
-; AVX2_FAST-NEXT: retq
+; AVX2-LABEL: hsub_v16i16b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubw %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
   %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32>
   %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32>
   %hop = sub <16 x i16> %a0, %a1
Index: test/CodeGen/X86/haddsub-undef.ll
===================================================================
--- test/CodeGen/X86/haddsub-undef.ll
+++ test/CodeGen/X86/haddsub-undef.ll
@@ -487,30 +487,15 @@
 }

 define <4 x float> @add_ps_007(<4 x float> %x) {
-; SSE-SLOW-LABEL: add_ps_007:
-; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE-SLOW-NEXT: retq
-;
-; SSE-FAST-LABEL: add_ps_007:
-; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: add_ps_007:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE-LABEL: add_ps_007:
+; SSE: # %bb.0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: retq
 ;
-; AVX-FAST-LABEL: add_ps_007:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: add_ps_007:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %add = fadd <4 x float> %l, %r
@@ -518,34 +503,17 @@
 }

 define <4 x float> @add_ps_030(<4 x float> %x) {
-; SSE-SLOW-LABEL: add_ps_030:
-; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSE-SLOW-NEXT: retq
-;
-; SSE-FAST-LABEL: add_ps_030:
-; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSE-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: add_ps_030:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-SLOW-NEXT: retq
+; SSE-LABEL: add_ps_030:
+; SSE: # %bb.0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSE-NEXT: retq
 ;
-; AVX-FAST-LABEL: add_ps_030:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: add_ps_030:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-NEXT: retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %add = fadd <4 x float> %l, %r
@@ -554,36 +522,15 @@
 }

 define <4 x float> @add_ps_007_2(<4 x float> %x) {
-; SSE-SLOW-LABEL: add_ps_007_2:
-; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE-SLOW-NEXT: retq
-;
-; SSE-FAST-LABEL: add_ps_007_2:
-; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE-FAST-NEXT: retq
-;
-; AVX1-SLOW-LABEL: add_ps_007_2:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: add_ps_007_2:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; SSE-LABEL: add_ps_007_2:
+; SSE: # %bb.0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: retq
 ;
-; AVX512-SLOW-LABEL: add_ps_007_2:
-; AVX512-SLOW: # %bb.0:
-; AVX512-SLOW-NEXT: vbroadcastss %xmm0, %xmm1
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX512-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX512-SLOW-NEXT: retq
+; AVX-LABEL: add_ps_007_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %add = fadd <4 x float> %l, %r
@@ -651,41 +598,17 @@
 }

 define <4 x float> @add_ps_018(<4 x float> %x) {
-; SSE-SLOW-LABEL: add_ps_018:
-; SSE-SLOW: # %bb.0:
-; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-SLOW-NEXT: retq
-;
-; SSE-FAST-LABEL: add_ps_018:
-; SSE-FAST: # %bb.0:
-; SSE-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-FAST-NEXT: retq
-;
-; AVX1-SLOW-LABEL: add_ps_018:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: add_ps_018:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-FAST-NEXT: retq
+; SSE-LABEL: add_ps_018:
+; SSE: # %bb.0:
+; SSE-NEXT: haddps %xmm0, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: retq
 ;
-; AVX512-SLOW-LABEL: add_ps_018:
-; AVX512-SLOW: # %bb.0:
-; AVX512-SLOW-NEXT: vbroadcastss %xmm0, %xmm1
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX512-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-SLOW-NEXT: retq
+; AVX-LABEL: add_ps_018:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: retq
   %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %add = fadd <4 x float> %l, %r
Index: test/CodeGen/X86/haddsub.ll
===================================================================
--- test/CodeGen/X86/haddsub.ll
+++ test/CodeGen/X86/haddsub.ll
@@ -103,30 +103,15 @@
 }

 define <4 x float> @haddps3(<4 x float> %x) {
-; SSE3-SLOW-LABEL: haddps3:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: haddps3:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: haddps3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: haddps3:
+; SSE3: # %bb.0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: haddps3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: haddps3:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = fadd <4 x float> %a, %b
@@ -134,30 +119,15 @@
 }

 define <4 x float> @haddps4(<4 x float> %x) {
-; SSE3-SLOW-LABEL: haddps4:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: haddps4:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: haddps4:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: haddps4:
+; SSE3: # %bb.0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: haddps4:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: haddps4:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = fadd <4 x float> %a, %b
@@ -165,30 +135,15 @@
 }

 define <4 x float> @haddps5(<4 x float> %x) {
-; SSE3-SLOW-LABEL: haddps5:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: haddps5:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: haddps5:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: haddps5:
+; SSE3: # %bb.0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: haddps5:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: haddps5:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = fadd <4 x float> %a, %b
@@ -224,30 +179,15 @@
 }

 define <4 x float> @haddps7(<4 x float> %x) {
-; SSE3-SLOW-LABEL: haddps7:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: haddps7:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: haddps7:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: haddps7:
+; SSE3: # %bb.0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: haddps7:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: haddps7:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = fadd <4 x float> %a, %b
@@ -316,31 +256,15 @@
 }

 define <4 x float> @hsubps2(<4 x float> %x) {
-; SSE3-SLOW-LABEL: hsubps2:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE3-SLOW-NEXT: subps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: hsubps2:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: hsubps2:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: hsubps2:
+; SSE3: # %bb.0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: hsubps2:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: hsubps2:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = fsub <4 x float> %a, %b
@@ -348,31 +272,15 @@
 }

 define <4 x float> @hsubps3(<4 x float> %x) {
-; SSE3-SLOW-LABEL: hsubps3:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE3-SLOW-NEXT: subps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: hsubps3:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: hsubps3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: hsubps3:
+; SSE3: # %bb.0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: hsubps3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: hsubps3:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32>
   %r = fsub <4 x float> %a, %b
@@ -442,35 +350,16 @@
 }

 define <8 x float> @vhaddps3(<8 x float> %x) {
-; SSE3-SLOW-LABEL: vhaddps3:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE3-SLOW-NEXT: addps %xmm3, %xmm0
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: vhaddps3:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: haddps %xmm0, %xmm0
-; SSE3-FAST-NEXT: haddps %xmm1, %xmm1
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: vhaddps3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX-SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: vhaddps3:
+; SSE3: # %bb.0:
+; SSE3-NEXT: haddps %xmm0, %xmm0
+; SSE3-NEXT: haddps %xmm1, %xmm1
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: vhaddps3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: vhaddps3:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32>
   %r = fadd <8 x float> %a, %b
@@ -495,37 +384,16 @@
 }

 define <8 x float> @vhsubps3(<8 x float> %x) {
-; SSE3-SLOW-LABEL: vhsubps3:
-; SSE3-SLOW: # %bb.0:
-; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE3-SLOW-NEXT: subps %xmm1, %xmm2
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE3-SLOW-NEXT: subps %xmm0, %xmm3
-; SSE3-SLOW-NEXT: movaps %xmm3, %xmm0
-; SSE3-SLOW-NEXT: movaps %xmm2, %xmm1
-; SSE3-SLOW-NEXT: retq
-;
-; SSE3-FAST-LABEL: vhsubps3:
-; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0
-; SSE3-FAST-NEXT: hsubps %xmm1, %xmm1
-; SSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: vhsubps3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX-SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0
-; AVX-SLOW-NEXT: retq
+; SSE3-LABEL: vhsubps3:
+; SSE3: # %bb.0:
+; SSE3-NEXT: hsubps %xmm0, %xmm0
+; SSE3-NEXT: hsubps %xmm1, %xmm1
+; SSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: vhsubps3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: vhsubps3:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
+; AVX-NEXT: retq
   %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32>
   %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32>
   %r = fsub <8 x float> %a, %b
@@ -1614,10 +1482,7 @@
 ; SSE3-SLOW-LABEL: PR39936_v8f32:
 ; SSE3-SLOW: # %bb.0:
 ; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
-; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
-; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
 ; SSE3-SLOW-NEXT: retq
@@ -1633,9 +1498,7 @@
 ; AVX-SLOW: # %bb.0:
 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-SLOW-NEXT: vzeroupper
Index: test/CodeGen/X86/phaddsub.ll
===================================================================
--- test/CodeGen/X86/phaddsub.ll
+++ test/CodeGen/X86/phaddsub.ll
@@ -71,29 +71,15 @@
 }

 define <4 x i32> @phaddd3(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd3:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd3:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddd3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddd3:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddd3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddd3:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = add <4 x i32> %a, %b
@@ -101,29 +87,15 @@
 }

 define <4 x i32> @phaddd4(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd4:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd4:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddd4:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddd4:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddd4:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddd4:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = add <4 x i32> %a, %b
@@ -131,29 +103,15 @@
 }

 define <4 x i32> @phaddd5(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd5:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd5:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddd5:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddd5:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddd5:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddd5:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = add <4 x i32> %a, %b
@@ -189,29 +147,15 @@
 }

 define <4 x i32> @phaddd7(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd7:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd7:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddd7:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddd7:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddd7:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddd7:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = add <4 x i32> %a, %b
@@ -251,30 +195,15 @@
 }

 define <4 x i32> @phsubd2(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phsubd2:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phsubd2:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phsubd2:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phsubd2:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phsubd2:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phsubd2:
+; AVX: # %bb.0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = sub <4 x i32> %a, %b
@@ -282,30 +211,15 @@
 }

 define <4 x i32> @phsubd3(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phsubd3:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phsubd3:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phsubd3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phsubd3:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phsubd3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phsubd3:
+; AVX: # %bb.0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = sub <4 x i32> %a, %b
@@ -398,29 +312,15 @@
 }

 define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd_single_source1:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd_single_source1:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddd_single_source1:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddd_single_source1:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddd_single_source1:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddd_single_source1:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %add = add <4 x i32> %l, %r
@@ -428,33 +328,17 @@
 }

 define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd_single_source2:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd_single_source2:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddd_single_source2:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddd_single_source2:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddd_single_source2:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddd_single_source2:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX-NEXT: retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32>
   %add = add <4 x i32> %l, %r
@@ -463,36 +347,15 @@
 }

 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd_single_source3:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd_single_source3:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX1-SLOW-LABEL: phaddd_single_source3:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: retq
@@ -463,36 +347,15 @@
 }
 
 define <4 x i32> @phaddd_single_source3(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd_single_source3:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd_single_source3:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX1-SLOW-LABEL: phaddd_single_source3:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: phaddd_single_source3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; SSSE3-LABEL: phaddd_single_source3:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: phaddd_single_source3:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm1
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: retq
+; AVX-LABEL: phaddd_single_source3:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = add <4 x i32> %l, %r
@@ -559,41 +422,17 @@
 }
 
 define <4 x i32> @phaddd_single_source6(<4 x i32> %x) {
-; SSSE3-SLOW-LABEL: phaddd_single_source6:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddd_single_source6:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; SSSE3-FAST-NEXT: retq
-;
-; AVX1-SLOW-LABEL: phaddd_single_source6:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: phaddd_single_source6:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-FAST-NEXT: retq
+; SSSE3-LABEL: phaddd_single_source6:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSSE3-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: phaddd_single_source6:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm1
-; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2-SLOW-NEXT: retq
+; AVX-LABEL: phaddd_single_source6:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX-NEXT: retq
   %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
   %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
   %add = add <4 x i32> %l, %r
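
Note: phaddd_single_source3 and phaddd_single_source6 narrow the same pattern down to a single demanded lane. A sketch of that shape (name and masks illustrative):

define <4 x i32> @phaddd_one_lane_sketch(<4 x i32> %x) {
  ; Only lane 2 of the sum (x0+x1) is defined; every other lane is undef,
  ; so no pre- or post-shuffles are needed around the phaddd at all.
  %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
  %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
  %add = add <4 x i32> %l, %r
  ret <4 x i32> %add
}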
@@ -602,30 +441,15 @@
 }
 
 define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
-; SSSE3-SLOW-LABEL: phaddw_single_source1:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddw_single_source1:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddw_single_source1:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13]
-; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15]
-; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddw_single_source1:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddw_single_source1:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddw_single_source1:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
   %add = add <8 x i16> %l, %r
@@ -633,41 +457,19 @@
 }
 
 define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
-; SSSE3-SLOW-LABEL: phaddw_single_source2:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddw_single_source2:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-FAST-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddw_single_source2:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddw_single_source2:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddw_single_source2:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddw_single_source2:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-NEXT: retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
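
Note: the phaddw tests are the <8 x i16> analogue; phaddw %xmm0, %xmm0 leaves the four pairwise sums x0+x1, x2+x3, x4+x5, x6+x7 in both halves of the result. A sketch (name and masks illustrative):

define <8 x i16> @phaddw_sketch(<8 x i16> %x) {
  ; The high half demands exactly the four pairwise sums, which the single
  ; phaddw already produces there; the low half is undef.
  %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
  %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
  %add = add <8 x i16> %l, %r
  ret <8 x i16> %add
}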
@@ -676,33 +478,15 @@
 }
 
 define <8 x i16> @phaddw_single_source3(<8 x i16> %x) {
-; SSSE3-SLOW-LABEL: phaddw_single_source3:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddw_single_source3:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-FAST-NEXT: retq
-;
-; AVX-SLOW-LABEL: phaddw_single_source3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
-; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; SSSE3-LABEL: phaddw_single_source3:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: retq
 ;
-; AVX-FAST-LABEL: phaddw_single_source3:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: retq
+; AVX-LABEL: phaddw_single_source3:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 0, i32 2, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 1, i32 3, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
@@ -739,42 +523,17 @@
 }
 
 define <8 x i16> @phaddw_single_source6(<8 x i16> %x) {
-; SSSE3-SLOW-LABEL: phaddw_single_source6:
-; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-SLOW-NEXT: retq
-;
-; SSSE3-FAST-LABEL: phaddw_single_source6:
-; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-FAST-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-FAST-NEXT: retq
-;
-; AVX1-SLOW-LABEL: phaddw_single_source6:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
-; AVX1-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX1-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX1-SLOW-NEXT: retq
-;
-; AVX-FAST-LABEL: phaddw_single_source6:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-FAST-NEXT: retq
+; SSSE3-LABEL: phaddw_single_source6:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: retq
 ;
-; AVX2-SLOW-LABEL: phaddw_single_source6:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: retq
+; AVX-LABEL: phaddw_single_source6:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT: retq
   %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
   %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
   %add = add <8 x i16> %l, %r
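
Note: the PR39936 hunks below cover a v8i32 horizontal-sum reduction. The first phaddd was already formed before this patch; the following shuffle/shuffle/add step now becomes a second phaddd on slow-horizontal-op targets as well, since it reuses a single value and needs at most one real shuffle. In isolation that step looks roughly like this (name and masks illustrative):

define i32 @reduce_step_sketch(<4 x i32> %v) {
  ; Pairwise-sum lanes 0 and 1, then fold the two partial sums to a scalar;
  ; the vector add below is the operation that now lowers to phaddd.
  %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %s1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %a = add <4 x i32> %s0, %s1
  %e0 = extractelement <4 x i32> %a, i32 0
  %e1 = extractelement <4 x i32> %a, i32 1
  %sum = add i32 %e0, %e1
  ret i32 %sum
}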
@@ -787,9 +546,7 @@
 ; SSSE3-SLOW-LABEL: PR39936_v8i32:
 ; SSSE3-SLOW: # %bb.0:
 ; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: phaddd %xmm0, %xmm0
 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1
 ; SSSE3-SLOW-NEXT: movd %xmm1, %eax
@@ -807,9 +564,7 @@
 ; AVX1-SLOW: # %bb.0:
 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
@@ -830,9 +585,7 @@
 ; AVX2-SLOW: # %bb.0:
 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2701,21 +2701,36 @@
 }
 
 define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: PR22377:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; SSE-NEXT: addps %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: PR22377:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE2-NEXT: addps %xmm0, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR22377:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movaps %xmm0, %xmm1
+; SSSE3-NEXT: haddps %xmm0, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR22377:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: haddps %xmm0, %xmm1
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: PR22377:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX-NEXT: retq
 entry:
   %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>