Index: llvm/trunk/lib/Target/X86/X86.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86.td
+++ llvm/trunk/lib/Target/X86/X86.td
@@ -404,6 +404,15 @@
     "Indicates that the BEXTR instruction is implemented as a single uop "
     "with good throughput.">;
 
+// Combine vector math operations with shuffles into horizontal math
+// instructions if a CPU implements horizontal operations (introduced with
+// SSE3) with better latency/throughput than the alternative sequence.
+def FeatureFastHorizontalOps
+    : SubtargetFeature<
+        "fast-hops", "HasFastHorizontalOps", "true",
+        "Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
+        "normal vector instructions with shuffles", [FeatureSSE3]>;
+
 // Merge branches using three-way conditional code.
 def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
     "ThreewayBranchProfitable", "true",
@@ -998,7 +1007,8 @@
   FeatureLAHFSAHF,
   FeatureFast15ByteNOP,
   FeatureFastBEXTR,
-  FeatureFastPartialYMMorZMMWrite
+  FeatureFastPartialYMMorZMMWrite,
+  FeatureFastHorizontalOps
 ]>;
 
 // Bulldozer
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -37031,9 +37031,6 @@
   // The low half of the 128-bit result must choose from A.
   // The high half of the 128-bit result must choose from B,
   // unless B is undef. In that case, we are always choosing from A.
-  // TODO: Using a horizontal op on a single input is likely worse for
-  // performance on many CPUs, so this should be limited here or reversed
-  // in a later pass.
   unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
   unsigned Src = B.getNode() ? i >= NumEltsPer64BitChunk : 0;
 
@@ -37051,6 +37048,16 @@
   return true;
 }
 
+/// Horizontal vector math instructions may be slower than normal math with
+/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
+/// implementation, and likely shuffle complexity of the alternate sequence.
+static bool shouldCombineToHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  bool IsOptimizingSize = DAG.getMachineFunction().getFunction().optForSize();
+  bool HasFastHOps = Subtarget.hasFastHorizontalOps();
+  return !IsSingleSource || IsOptimizingSize || HasFastHOps;
+}
+
 /// Do target-specific dag combines on floating-point adds/subs.
 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
@@ -37063,7 +37070,8 @@
   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
-      isHorizontalBinOp(LHS, RHS, IsFadd)) {
+      isHorizontalBinOp(LHS, RHS, IsFadd) &&
+      shouldCombineToHorizontalOp(LHS == RHS, DAG, Subtarget)) {
     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
   }
@@ -39787,7 +39795,8 @@
   // Try to synthesize horizontal adds from adds of shuffles.
if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true)) { + Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, true) && + shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) { auto HADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops); @@ -39918,7 +39927,8 @@ EVT VT = N->getValueType(0); if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && - Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false)) { + Subtarget.hasSSSE3() && isHorizontalBinOp(Op0, Op1, false) && + shouldCombineToHorizontalOp(Op0 == Op1, DAG, Subtarget)) { auto HSUBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, ArrayRef Ops) { return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops); Index: llvm/trunk/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/trunk/lib/Target/X86/X86Subtarget.h +++ llvm/trunk/lib/Target/X86/X86Subtarget.h @@ -388,6 +388,9 @@ /// Processor has a single uop BEXTR implementation. bool HasFastBEXTR = false; + /// Try harder to combine to horizontal vector ops if they are fast. + bool HasFastHorizontalOps = false; + /// Use a retpoline thunk rather than indirect calls to block speculative /// execution. bool UseRetpolineIndirectCalls = false; @@ -636,6 +639,7 @@ bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } bool hasFastBEXTR() const { return HasFastBEXTR; } + bool hasFastHorizontalOps() const { return HasFastHorizontalOps; } bool hasMacroFusion() const { return HasMacroFusion; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } Index: llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll +++ llvm/trunk/test/CodeGen/X86/avx2-phaddsub.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW +; RUN: llc < %s -mtriple=i686-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X32,X32-FAST +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=X64,X64-FAST define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) { ; X32-LABEL: phaddw1: @@ -67,15 +69,29 @@ } define <8 x i32> @phaddd3(<8 x i32> %x) { -; X32-LABEL: phaddd3: -; X32: # %bb.0: -; X32-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; X32-NEXT: retl -; -; X64-LABEL: phaddd3: -; X64: # %bb.0: -; X64-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; X64-NEXT: retq +; X32-SLOW-LABEL: phaddd3: +; X32-SLOW: # %bb.0: +; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; X32-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; X32-SLOW-NEXT: retl +; +; X32-FAST-LABEL: phaddd3: +; X32-FAST: # %bb.0: +; X32-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; X32-FAST-NEXT: retl +; +; X64-SLOW-LABEL: phaddd3: +; X64-SLOW: # %bb.0: +; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm0[0,2,2,3,4,6,6,7] +; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; X64-SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; X64-SLOW-NEXT: retq +; +; X64-FAST-LABEL: phaddd3: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; X64-FAST-NEXT: retq %a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %b = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> %r = add <8 x i32> %a, %b Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6860,7 +6860,8 @@ ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} @@ -6989,7 +6990,8 @@ ; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -7004,7 +7006,8 @@ ; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; X64-NEXT: vmovd %xmm0, %eax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -7210,7 +7213,8 @@ ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -7225,7 +7229,8 @@ ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -7405,7 +7410,8 @@ ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovlpd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -7422,7 +7428,8 @@ ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: Index: llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll +++ llvm/trunk/test/CodeGen/X86/haddsub-shuf.ll @@ -1,21 +1,54 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 -; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST ; The next 8 tests check for matching the horizontal op and eliminating the shuffle. ; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111 define <4 x float> @hadd_v4f32(<4 x float> %a) { -; SSSE3-LABEL: hadd_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v4f32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: addps %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v4f32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v4f32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4f32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4f32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4f32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %hop = fadd <2 x float> %a02, %a13 @@ -54,16 +87,51 @@ } define <8 x float> @hadd_v8f32b(<8 x float> %a) { -; SSSE3-LABEL: hadd_v8f32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddps %xmm0, %xmm0 -; SSSE3-NEXT: haddps %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v8f32b: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8f32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: addps %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: addps %xmm3, %xmm1 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; 
SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v8f32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: haddps %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8f32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8f32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v8f32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v8f32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %hop = fadd <8 x float> %a0, %a1 @@ -72,15 +140,45 @@ } define <4 x float> @hsub_v4f32(<4 x float> %a) { -; SSSE3-LABEL: hsub_v4f32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubps %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v4f32: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v4f32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: subps %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v4f32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v4f32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4f32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4f32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4f32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> %hop = fsub <2 x float> %a02, %a13 @@ -119,16 +217,51 @@ } define <8 x float> @hsub_v8f32b(<8 x float> %a) { -; SSSE3-LABEL: hsub_v8f32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubps %xmm0, %xmm0 -; SSSE3-NEXT: hsubps %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v8f32b: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8f32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[2,3] +; 
SSSE3_SLOW-NEXT: movaps %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm1[2,3] +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: subps %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: subps %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v8f32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0 +; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8f32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8f32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v8f32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v8f32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> %hop = fsub <8 x float> %a0, %a1 @@ -137,15 +270,42 @@ } define <2 x double> @hadd_v2f64(<2 x double> %a) { -; SSSE3-LABEL: hadd_v2f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddpd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v2f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v2f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v2f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v2f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v2f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v2f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %hop = fadd <2 x double> %a0, %a1 @@ -154,16 +314,47 @@ } define <4 x double> @hadd_v4f64(<4 x double> %a) { -; SSSE3-LABEL: hadd_v4f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: haddpd %xmm0, %xmm0 -; SSSE3-NEXT: haddpd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: 
retq +; SSSE3_SLOW-LABEL: hadd_v4f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSSE3_SLOW-NEXT: addpd %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: addpd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v4f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v4f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX1_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX2_SLOW-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fadd <4 x double> %a0, %a1 @@ -172,15 +363,42 @@ } define <2 x double> @hsub_v2f64(<2 x double> %a) { -; SSSE3-LABEL: hsub_v2f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubpd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v2f64: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v2f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3_SLOW-NEXT: subpd %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v2f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v2f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v2f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v2f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2_SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v2f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> %hop = fsub <2 x double> %a0, %a1 @@ -189,16 +407,47 @@ } define <4 x double> @hsub_v4f64(<4 x double> %a) { -; SSSE3-LABEL: hsub_v4f64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: hsubpd %xmm0, %xmm0 -; SSSE3-NEXT: hsubpd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v4f64: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v4f64: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2 
+; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3 +; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSSE3_SLOW-NEXT: subpd %xmm3, %xmm1 +; SSSE3_SLOW-NEXT: subpd %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v4f64: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v4f64: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX1_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4f64: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4f64: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] +; AVX2_SLOW-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX2_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4f64: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vhsubpd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> %hop = fsub <4 x double> %a0, %a1 @@ -207,15 +456,44 @@ } define <4 x i32> @hadd_v4i32(<4 x i32> %a) { -; SSSE3-LABEL: hadd_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v4i32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v4i32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v4i32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v4i32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v4i32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v4i32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %hop = add <4 x i32> %a02, %a13 @@ -254,25 +532,57 @@ } define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { -; SSSE3-LABEL: hadd_v8i32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: phaddd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hadd_v8i32b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: 
vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hadd_v8i32b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8i32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: paddd %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: paddd %xmm3, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v8i32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phaddd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8i32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1_SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8i32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v8i32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v8i32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %hop = add <8 x i32> %a0, %a1 @@ -281,15 +591,44 @@ } define <4 x i32> @hsub_v4i32(<4 x i32> %a) { -; SSSE3-LABEL: hsub_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v4i32: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v4i32: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v4i32: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v4i32: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v4i32: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX2_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v4i32: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %hop = sub <4 x i32> %a02, %a13 @@ -328,25 +667,57 @@ } define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { -; SSSE3-LABEL: hsub_v8i32b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: phsubd %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hsub_v8i32b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hsub_v8i32b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphsubd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8i32b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm2 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] +; SSSE3_SLOW-NEXT: psubd %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v8i32b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phsubd %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8i32b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX1_SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1_SLOW-NEXT: vpsubd %xmm2, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8i32b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v8i32b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2_SLOW-NEXT: vpsubd %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v8i32b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> %hop = sub <8 x i32> %a0, %a1 @@ -355,15 +726,45 @@ } define <8 x i16> @hadd_v8i16(<8 x i16> %a) { -; SSSE3-LABEL: hadd_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hadd_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v8i16: 
+; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v8i16: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v8i16: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v8i16: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v8i16: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX2_SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v8i16: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %hop = add <8 x i16> %a0246, %a1357 @@ -402,25 +803,64 @@ } define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { -; SSSE3-LABEL: hadd_v16i16b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: phaddw %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: hadd_v16i16b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hadd_v16i16b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hadd_v16i16b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3 +; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4 +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: paddw %xmm3, %xmm0 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: paddw %xmm4, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hadd_v16i16b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phaddw %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hadd_v16i16b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; 
AVX1_SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hadd_v16i16b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hadd_v16i16b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2_SLOW-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hadd_v16i16b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %hop = add <16 x i16> %a0, %a1 @@ -429,15 +869,45 @@ } define <8 x i16> @hsub_v8i16(<8 x i16> %a) { -; SSSE3-LABEL: hsub_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: hsub_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v8i16: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm1 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v8i16: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v8i16: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX1_SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v8i16: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v8i16: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX2_SLOW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 +; AVX2_SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v8i16: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; AVX2_FAST-NEXT: retq %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> %hop = sub <8 x i16> %a0246, %a1357 @@ -476,25 +946,64 @@ } define <16 x i16> @hsub_v16i16b(<16 x i16> %a) { -; SSSE3-LABEL: hsub_v16i16b: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubw %xmm0, %xmm0 -; SSSE3-NEXT: phsubw %xmm1, %xmm1 -; SSSE3-NEXT: retq -; -; AVX1-LABEL: 
hsub_v16i16b: -; AVX1: # %bb.0: -; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; AVX1-NEXT: retq -; -; AVX2-LABEL: hsub_v16i16b: -; AVX2: # %bb.0: -; AVX2-NEXT: vphsubw %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: retq +; SSSE3_SLOW-LABEL: hsub_v16i16b: +; SSSE3_SLOW: # %bb.0: +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm3 +; SSSE3_SLOW-NEXT: movdqa %xmm1, %xmm4 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm4 +; SSSE3_SLOW-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm0 +; SSSE3_SLOW-NEXT: psubw %xmm0, %xmm3 +; SSSE3_SLOW-NEXT: pshufb %xmm2, %xmm1 +; SSSE3_SLOW-NEXT: psubw %xmm1, %xmm4 +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1] +; SSSE3_SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] +; SSSE3_SLOW-NEXT: retq +; +; SSSE3_FAST-LABEL: hsub_v16i16b: +; SSSE3_FAST: # %bb.0: +; SSSE3_FAST-NEXT: phsubw %xmm0, %xmm0 +; SSSE3_FAST-NEXT: phsubw %xmm1, %xmm1 +; SSSE3_FAST-NEXT: retq +; +; AVX1_SLOW-LABEL: hsub_v16i16b: +; AVX1_SLOW: # %bb.0: +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm2 +; AVX1_SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1_SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX1_SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1_SLOW-NEXT: vpsubw %xmm0, %xmm2, %xmm0 +; AVX1_SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 +; AVX1_SLOW-NEXT: vpsubw %xmm2, %xmm1, %xmm1 +; AVX1_SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1_SLOW-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_SLOW-NEXT: retq +; +; AVX1_FAST-LABEL: hsub_v16i16b: +; AVX1_FAST: # %bb.0: +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm1 +; AVX1_FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1_FAST-NEXT: vphsubw %xmm0, %xmm0, %xmm0 +; AVX1_FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1_FAST-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1_FAST-NEXT: retq +; +; AVX2_SLOW-LABEL: hsub_v16i16b: +; AVX2_SLOW: # %bb.0: +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2_SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] +; AVX2_SLOW-NEXT: vpsubw %ymm0, %ymm1, %ymm0 +; AVX2_SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2_SLOW-NEXT: retq +; +; AVX2_FAST-LABEL: hsub_v16i16b: +; AVX2_FAST: # %bb.0: +; AVX2_FAST-NEXT: vphsubw %ymm0, %ymm0, %ymm0 +; AVX2_FAST-NEXT: retq %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> %hop = sub <16 x i16> %a0, %a1 Index: llvm/trunk/test/CodeGen/X86/haddsub-undef.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/haddsub-undef.ll +++ llvm/trunk/test/CodeGen/X86/haddsub-undef.ll @@ -1,7 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX 
--check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs. @@ -339,8 +342,6 @@ ret <8 x i32> %vecinit5 } -; On AVX2, the following sequence can be folded into a single horizontal add. -; If the Subtarget doesn't support AVX2, then we avoid emitting two packed ; integer horizontal adds instead of two scalar adds followed by vector inserts. define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: test15_undef: @@ -451,15 +452,38 @@ } define <2 x double> @add_pd_003(<2 x double> %x) { -; SSE-LABEL: add_pd_003: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: add_pd_003: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_pd_003: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_pd_003: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_pd_003: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_pd_003: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_pd_003: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_pd_003: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x ret <2 x double> %add @@ -468,31 +492,84 @@ ; Change shuffle mask - no undefs. 
define <2 x double> @add_pd_003_2(<2 x double> %x) { -; SSE-LABEL: add_pd_003_2: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: add_pd_003_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_pd_003_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_pd_003_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_pd_003_2: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_pd_003_2: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_pd_003_2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_pd_003_2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x ret <2 x double> %add } define <2 x double> @add_pd_010(<2 x double> %x) { -; SSE-LABEL: add_pd_010: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: add_pd_010: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_pd_010: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_pd_010: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_pd_010: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_pd_010: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_pd_010: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_pd_010: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX2-FAST-NEXT: retq %l = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %add = fadd <2 x double> %l, %x %shuffle2 = shufflevector <2 x double> %add, <2 x double> undef, <2 x i32> @@ -500,15 +577,42 @@ } define <4 x float> @add_ps_007(<4 x float> %x) { -; SSE-LABEL: add_ps_007: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: add_ps_007: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_ps_007: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; 
SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_007: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_007: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_007: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_007: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_007: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -516,17 +620,48 @@ } define <4 x float> @add_ps_030(<4 x float> %x) { -; SSE-LABEL: add_ps_030: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] -; SSE-NEXT: retq -; -; AVX-LABEL: add_ps_030: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_ps_030: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_030: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_030: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_030: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_030: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_030: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -535,15 +670,41 @@ } define <4 x float> @add_ps_007_2(<4 x float> %x) { -; SSE-LABEL: add_ps_007_2: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: add_ps_007_2: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_ps_007_2: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: 
addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_007_2: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_007_2: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_007_2: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_007_2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_007_2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r @@ -551,32 +712,83 @@ } define <4 x float> @add_ps_008(<4 x float> %x) { -; SSE-LABEL: add_ps_008: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: add_ps_008: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_ps_008: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_008: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_008: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_008: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_008: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_008: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %x ret <4 x float> %add } define <4 x float> @add_ps_017(<4 x float> %x) { -; SSE-LABEL: add_ps_017: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: retq -; -; AVX-LABEL: add_ps_017: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_ps_017: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; SSE-SLOW-NEXT: addps %xmm0, %xmm1 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_017: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_017: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_017: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[3,1,2,3] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_017: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_017: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %x %shuffle2 = shufflevector <4 x float> %add, <4 x float> undef, <4 x i32> @@ -584,17 +796,47 @@ } define <4 x float> @add_ps_018(<4 x float> %x) { -; SSE-LABEL: add_ps_018: -; SSE: # %bb.0: -; SSE-NEXT: haddps %xmm0, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: retq -; -; AVX-LABEL: add_ps_018: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; SSE-SLOW-LABEL: add_ps_018: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_ps_018: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-FAST-NEXT: retq +; +; AVX1-SLOW-LABEL: add_ps_018: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-SLOW-NEXT: retq +; +; AVX1-FAST-LABEL: add_ps_018: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: add_ps_018: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: add_ps_018: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %add = fadd <4 x float> %l, %r Index: llvm/trunk/test/CodeGen/X86/haddsub.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/haddsub.ll +++ llvm/trunk/test/CodeGen/X86/haddsub.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) 
{ ; SSE3-LABEL: haddpd1: @@ -35,15 +37,29 @@ } define <2 x double> @haddpd3(<2 x double> %x) { -; SSE3-LABEL: haddpd3: -; SSE3: # %bb.0: -; SSE3-NEXT: haddpd %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: haddpd3: -; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: haddpd3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1 +; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: haddpd3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddpd3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddpd3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %r = fadd <2 x double> %a, %b @@ -83,15 +99,30 @@ } define <4 x float> @haddps3(<4 x float> %x) { -; SSE3-LABEL: haddps3: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: haddps3: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: haddps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: haddps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -99,15 +130,30 @@ } define <4 x float> @haddps4(<4 x float> %x) { -; SSE3-LABEL: haddps4: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: haddps4: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: haddps4: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: haddps4: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -115,15 +161,30 @@ } define <4 x float> @haddps5(<4 x float> %x) { -; SSE3-LABEL: haddps5: -; 
SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: haddps5: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: haddps5: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,2,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: haddps5: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps5: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps5: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -131,15 +192,27 @@ } define <4 x float> @haddps6(<4 x float> %x) { -; SSE3-LABEL: haddps6: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: haddps6: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: haddps6: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: haddps6: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -147,15 +220,30 @@ } define <4 x float> @haddps7(<4 x float> %x) { -; SSE3-LABEL: haddps7: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: haddps7: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: haddps7: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: haddps7: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: haddps7: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: haddps7: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fadd <4 x float> %a, %b @@ -179,15 +267,28 @@ } define <2 x double> @hsubpd2(<2 x double> %x) { -; SSE3-LABEL: hsubpd2: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubpd %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: hsubpd2: -; AVX: # %bb.0: -; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: hsubpd2: 
+; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 +; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: hsubpd2: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubpd %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubpd2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubpd2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %b = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> %r = fsub <2 x double> %a, %b @@ -211,15 +312,31 @@ } define <4 x float> @hsubps2(<4 x float> %x) { -; SSE3-LABEL: hsubps2: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: hsubps2: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: hsubps2: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE3-SLOW-NEXT: subps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: hsubps2: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubps2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubps2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fsub <4 x float> %a, %b @@ -227,15 +344,31 @@ } define <4 x float> @hsubps3(<4 x float> %x) { -; SSE3-LABEL: hsubps3: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: hsubps3: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: hsubps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: subps %xmm0, %xmm1 +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: hsubps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fsub <4 x float> %a, %b @@ -243,15 +376,27 @@ } define <4 x float> @hsubps4(<4 x float> %x) { -; SSE3-LABEL: hsubps4: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: retq -; -; AVX-LABEL: hsubps4: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: hsubps4: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] 
+; SSE3-SLOW-NEXT: subps %xmm1, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: hsubps4: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: hsubps4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: hsubps4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %b = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = fsub <4 x float> %a, %b @@ -293,16 +438,35 @@ } define <8 x float> @vhaddps3(<8 x float> %x) { -; SSE3-LABEL: vhaddps3: -; SSE3: # %bb.0: -; SSE3-NEXT: haddps %xmm0, %xmm0 -; SSE3-NEXT: haddps %xmm1, %xmm1 -; SSE3-NEXT: retq -; -; AVX-LABEL: vhaddps3: -; AVX: # %bb.0: -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: vhaddps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE3-SLOW-NEXT: addps %xmm2, %xmm1 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: addps %xmm3, %xmm0 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: vhaddps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 +; SSE3-FAST-NEXT: haddps %xmm1, %xmm1 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: vhaddps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX-SLOW-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: vhaddps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-FAST-NEXT: retq %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %b = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %r = fadd <8 x float> %a, %b @@ -327,16 +491,37 @@ } define <8 x float> @vhsubps3(<8 x float> %x) { -; SSE3-LABEL: vhsubps3: -; SSE3: # %bb.0: -; SSE3-NEXT: hsubps %xmm0, %xmm0 -; SSE3-NEXT: hsubps %xmm1, %xmm1 -; SSE3-NEXT: retq -; -; AVX-LABEL: vhsubps3: -; AVX: # %bb.0: -; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0 -; AVX-NEXT: retq +; SSE3-SLOW-LABEL: vhsubps3: +; SSE3-SLOW: # %bb.0: +; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3] +; SSE3-SLOW-NEXT: movaps %xmm0, %xmm3 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[2,3] +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; SSE3-SLOW-NEXT: subps %xmm1, %xmm2 +; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSE3-SLOW-NEXT: subps %xmm0, %xmm3 +; SSE3-SLOW-NEXT: movaps %xmm3, %xmm0 +; SSE3-SLOW-NEXT: movaps %xmm2, %xmm1 +; SSE3-SLOW-NEXT: retq +; +; SSE3-FAST-LABEL: vhsubps3: +; SSE3-FAST: # %bb.0: +; SSE3-FAST-NEXT: hsubps %xmm0, %xmm0 +; SSE3-FAST-NEXT: hsubps %xmm1, %xmm1 +; SSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: vhsubps3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX-SLOW-NEXT: vsubps %ymm0, %ymm1, %ymm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: vhsubps3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vhsubps %ymm0, %ymm0, %ymm0 +; AVX-FAST-NEXT: retq %a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %b = 
shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> %r = fsub <8 x float> %a, %b Index: llvm/trunk/test/CodeGen/X86/madd.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/madd.ll +++ llvm/trunk/test/CodeGen/X86/madd.ll @@ -50,7 +50,8 @@ ; AVX-NEXT: # %bb.2: # %middle.block ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: @@ -129,7 +130,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -153,7 +155,8 @@ ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -252,7 +255,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -278,7 +282,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -437,7 +442,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -469,7 +475,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -620,7 +627,8 @@ ; AVX-NEXT: # %bb.2: # %middle.block ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: @@ -704,7 +712,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -729,7 +738,8 @@ ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -836,7 +846,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -863,7 +874,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1039,7 +1051,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1073,7 +1086,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1222,7 +1236,8 @@ ; AVX-NEXT: # %bb.2: # %middle.block ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq entry: @@ -1313,7 +1328,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1338,7 +1354,8 @@ ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq @@ -1460,7 +1477,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1491,7 +1509,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1699,7 +1718,8 
@@ ; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1742,7 +1762,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2692,7 +2713,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; @@ -2707,7 +2729,8 @@ ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX256-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX256-NEXT: vmovd %xmm0, %eax ; AVX256-NEXT: vzeroupper ; AVX256-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/phaddsub.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/phaddsub.ll +++ llvm/trunk/test/CodeGen/X86/phaddsub.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) { ; SSSE3-LABEL: phaddw1: @@ -67,15 +69,29 @@ } define <4 x i32> @phaddd3(<4 x i32> %x) { -; SSSE3-LABEL: phaddd3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd3: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -83,15 +99,29 @@ } define <4 x 
i32> @phaddd4(<4 x i32> %x) { -; SSSE3-LABEL: phaddd4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd4: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -99,15 +129,29 @@ } define <4 x i32> @phaddd5(<4 x i32> %x) { -; SSSE3-LABEL: phaddd5: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd5: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd5: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd5: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd5: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd5: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -115,15 +159,27 @@ } define <4 x i32> @phaddd6(<4 x i32> %x) { -; SSSE3-LABEL: phaddd6: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd6: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd6: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd6: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -131,15 +187,29 @@ } define <4 x i32> @phaddd7(<4 x i32> %x) { -; SSSE3-LABEL: phaddd7: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd7: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd7: +; SSSE3-SLOW: # %bb.0: +; 
SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd7: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd7: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd7: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = add <4 x i32> %a, %b @@ -179,15 +249,30 @@ } define <4 x i32> @phsubd2(<4 x i32> %x) { -; SSSE3-LABEL: phsubd2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phsubd2: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phsubd2: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phsubd2: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phsubd2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phsubd2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = sub <4 x i32> %a, %b @@ -195,15 +280,30 @@ } define <4 x i32> @phsubd3(<4 x i32> %x) { -; SSSE3-LABEL: phsubd3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phsubd3: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phsubd3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; SSSE3-SLOW-NEXT: psubd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phsubd3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phsubd3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-SLOW-NEXT: vpsubd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phsubd3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = sub <4 x i32> %a, %b @@ -211,15 +311,27 @@ } define <4 x i32> @phsubd4(<4 x i32> %x) { -; SSSE3-LABEL: phsubd4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phsubd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phsubd4: -; AVX: # %bb.0: -; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phsubd4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSSE3-SLOW-NEXT: psubd %xmm1, %xmm0 +; 
SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phsubd4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phsubd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phsubd4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-SLOW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phsubd4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = sub <4 x i32> %a, %b @@ -284,15 +396,29 @@ } define <4 x i32> @phaddd_single_source1(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd_single_source1: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source1: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd_single_source1: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -300,17 +426,33 @@ } define <4 x i32> @phaddd_single_source2(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd_single_source2: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source2: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd_single_source2: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,2] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -319,15 +461,29 @@ } define <4 x i32> @phaddd_single_source3(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq 
-; -; AVX-LABEL: phaddd_single_source3: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd_single_source3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -335,32 +491,58 @@ } define <4 x i32> @phaddd_single_source4(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd_single_source4: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd_single_source4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %x ret <4 x i32> %add } define <4 x i32> @phaddd_single_source5(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source5: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd_single_source5: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source5: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd_single_source5: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source5: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,2] +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source5: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %x %shuffle2 = shufflevector <4 x i32> %add, <4 x i32> undef, <4 x i32> @@ -368,17 +550,33 
@@ } define <4 x i32> @phaddd_single_source6(<4 x i32> %x) { -; SSSE3-LABEL: phaddd_single_source6: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddd_single_source6: -; AVX: # %bb.0: -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddd_single_source6: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddd_single_source6: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddd_single_source6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddd_single_source6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-FAST-NEXT: retq %l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> %add = add <4 x i32> %l, %r @@ -387,15 +585,30 @@ } define <8 x i16> @phaddw_single_source1(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source1: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddw_single_source1: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source1: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13] +; SSSE3-SLOW-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddw_single_source1: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source1: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13] +; AVX-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15] +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source1: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -403,19 +616,41 @@ } define <8 x i16> @phaddw_single_source2(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source2: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddw_single_source2: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source2: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} 
xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddw_single_source2: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source2: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source2: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -424,15 +659,33 @@ } define <8 x i16> @phaddw_single_source3(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source3: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddw_single_source3: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source3: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddw_single_source3: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source3: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source3: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r @@ -440,32 +693,63 @@ } define <8 x i16> @phaddw_single_source4(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source4: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddw_single_source4: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source4: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: pslld $16, %xmm1 +; SSSE3-SLOW-NEXT: paddw %xmm0, %xmm1 +; SSSE3-SLOW-NEXT: 
movdqa %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddw_single_source4: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source4: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source4: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %x ret <8 x i16> %add } define <8 x i16> @phaddw_single_source6(<8 x i16> %x) { -; SSSE3-LABEL: phaddw_single_source6: -; SSSE3: # %bb.0: -; SSSE3-NEXT: phaddw %xmm0, %xmm0 -; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; AVX-LABEL: phaddw_single_source6: -; AVX: # %bb.0: -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq +; SSSE3-SLOW-LABEL: phaddw_single_source6: +; SSSE3-SLOW: # %bb.0: +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-SLOW-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-SLOW-NEXT: retq +; +; SSSE3-FAST-LABEL: phaddw_single_source6: +; SSSE3-FAST: # %bb.0: +; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0 +; SSSE3-FAST-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSSE3-FAST-NEXT: retq +; +; AVX-SLOW-LABEL: phaddw_single_source6: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX-SLOW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: phaddw_single_source6: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-FAST-NEXT: retq %l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> %add = add <8 x i16> %l, %r Index: llvm/trunk/test/CodeGen/X86/required-vector-width.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/required-vector-width.ll +++ llvm/trunk/test/CodeGen/X86/required-vector-width.ll @@ -190,7 +190,8 @@ ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -319,7 +320,8 @@ ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vmovd %xmm0, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/sad.ll 
=================================================================== --- llvm/trunk/test/CodeGen/X86/sad.ll +++ llvm/trunk/test/CodeGen/X86/sad.ll @@ -56,7 +56,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -80,7 +81,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -152,16 +154,16 @@ ; SSE2-NEXT: pxor %xmm12, %xmm12 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm13, %xmm13 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm15, %xmm15 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body @@ -219,17 +221,17 @@ ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: paddd %xmm6, %xmm7 ; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 ; SSE2-NEXT: paddd %xmm6, %xmm4 ; SSE2-NEXT: pxor %xmm6, %xmm4 ; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: paddd %xmm4, %xmm1 @@ -244,9 +246,9 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -256,9 +258,9 @@ ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm0 -; 
SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm8, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm8 @@ -267,13 +269,13 @@ ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm15, %xmm0 ; SSE2-NEXT: paddd %xmm14, %xmm13 ; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm13, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] @@ -317,7 +319,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -343,7 +346,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -420,42 +424,42 @@ ; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: 
pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 ; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 ; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 @@ -516,7 +520,7 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] ; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: psubd %xmm0, %xmm15 ; SSE2-NEXT: movdqa %xmm7, %xmm0 @@ -524,8 +528,8 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] ; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] @@ -534,7 +538,7 @@ ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] @@ -563,16 +567,16 @@ ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa 
%xmm6, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm5, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm5 @@ -584,118 +588,118 @@ ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm8, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm8 ; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm11, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm15, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm15 ; SSE2-NEXT: pxor %xmm1, %xmm15 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; 
SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm12, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm12 ; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: movdqa %xmm13, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa 
{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload ; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 @@ -737,30 +741,30 @@ ; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero ; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4 @@ -803,27 +807,27 @@ ; AVX1-NEXT: vpabsd %xmm4, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm13, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm8, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 ; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm9, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 ; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-NEXT: vpaddd %xmm10, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10 ; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 @@ -858,7 +862,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper @@ -886,10 +891,10 @@ ; 
AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -903,9 +908,9 @@ ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpabsd %ymm9, %ymm8 ; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 @@ -935,7 +940,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1430,7 +1436,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; @@ -1448,7 +1455,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, 
%ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1533,7 +1541,8 @@ ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: retq ; @@ -1548,7 +1557,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq Index: llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll +++ llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll @@ -195,28 +195,21 @@ ; define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: phaddd %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4i32: ; AVX: # %bb.0: ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq ; @@ -224,7 +217,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: retq %1 = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a0) @@ -232,24 +226,15 @@ } define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: phaddd %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i32: ; AVX1: # %bb.0: @@ -257,7 +242,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; 
AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -268,7 +254,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -279,7 +266,8 @@ ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -288,28 +276,17 @@ } define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: paddd %xmm2, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: paddd %xmm1, %xmm0 -; SSE41-NEXT: phaddd %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v16i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i32: ; AVX1: # %bb.0: @@ -320,7 +297,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -332,7 +310,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -355,36 +334,21 @@ } define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: paddd %xmm6, %xmm2 -; 
SSE41-NEXT: paddd %xmm7, %xmm3 -; SSE41-NEXT: paddd %xmm5, %xmm3 -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: paddd %xmm4, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: paddd %xmm2, %xmm0 -; SSE41-NEXT: phaddd %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v32i32: +; SSE: # %bb.0: +; SSE-NEXT: paddd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm5, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm2 +; SSE-NEXT: paddd %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: @@ -401,7 +365,8 @@ ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -415,7 +380,8 @@ ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -443,29 +409,18 @@ ; define i16 @test_v8i16(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: phaddw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX-LABEL: test_v8i16: ; AVX: # %bb.0: @@ -473,7 +428,8 @@ ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: # kill: def $ax killed $ax killed $eax ; AVX-NEXT: retq @@ -484,7 +440,8 @@ ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 
; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: retq @@ -493,31 +450,19 @@ } define i16 @test_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: phaddw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i16: ; AVX1: # %bb.0: @@ -527,7 +472,8 @@ ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -541,7 +487,8 @@ ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -555,7 +502,8 @@ ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovd %xmm0, %eax ; AVX512-NEXT: # kill: def $ax killed $ax killed $eax ; AVX512-NEXT: vzeroupper @@ -565,35 +513,21 @@ } define i16 @test_v32i16(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: paddw %xmm3, %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: paddw %xmm3, %xmm1 -; SSE41-NEXT: paddw %xmm2, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: paddw 
%xmm0, %xmm1 -; SSE41-NEXT: phaddw %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v32i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v32i16: ; AVX1: # %bb.0: @@ -606,7 +540,8 @@ ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -621,7 +556,8 @@ ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper @@ -648,43 +584,25 @@ } define i16 @test_v64i16(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16: -; SSE2: # %bb.0: -; SSE2-NEXT: paddw %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm7, %xmm3 -; SSE2-NEXT: paddw %xmm5, %xmm3 -; SSE2-NEXT: paddw %xmm1, %xmm3 -; SSE2-NEXT: paddw %xmm4, %xmm2 -; SSE2-NEXT: paddw %xmm3, %xmm2 -; SSE2-NEXT: paddw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: paddw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16: -; SSE41: # %bb.0: -; SSE41-NEXT: paddw %xmm6, %xmm2 -; SSE41-NEXT: paddw %xmm7, %xmm3 -; SSE41-NEXT: paddw %xmm5, %xmm3 -; SSE41-NEXT: paddw %xmm1, %xmm3 -; SSE41-NEXT: paddw %xmm4, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: paddw %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: phaddw %xmm1, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq +; SSE-LABEL: test_v64i16: +; SSE: # %bb.0: +; SSE-NEXT: paddw %xmm6, %xmm2 +; SSE-NEXT: paddw %xmm7, %xmm3 +; SSE-NEXT: paddw %xmm5, %xmm3 +; SSE-NEXT: paddw %xmm1, %xmm3 +; SSE-NEXT: paddw %xmm4, %xmm2 +; SSE-NEXT: paddw %xmm3, %xmm2 +; SSE-NEXT: paddw %xmm0, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-NEXT: retq ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: @@ -703,7 +621,8 @@ ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,2,3] ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax ; AVX1-NEXT: vzeroupper @@ -720,7 +639,8 @@ ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vphaddw %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: # kill: def $ax killed $ax killed $eax ; AVX2-NEXT: vzeroupper Index: llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll +++ llvm/trunk/test/CodeGen/X86/vector-reduce-fadd-fast.ll @@ -20,18 +20,20 @@ ; ; SSE41-LABEL: test_v2f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm1, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32: ; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1) ret float %1 @@ -50,24 +52,27 @@ ; ; SSE41-LABEL: test_v4f32: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: addps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] ; AVX512-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1) ret float %1 @@ -88,10 +93,11 @@ ; SSE41-LABEL: test_v8f32: ; SSE41: # %bb.0: ; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: addps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v8f32: @@ -100,7 +106,8 @@ ; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: 
vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -111,7 +118,8 @@ ; AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -138,10 +146,11 @@ ; SSE41-NEXT: addps %xmm4, %xmm2 ; SSE41-NEXT: addps %xmm3, %xmm1 ; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm1, %xmm2 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE41-NEXT: addps %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f32: @@ -151,7 +160,8 @@ ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -188,17 +198,20 @@ ; ; SSE41-LABEL: test_v2f32_zero: ; SSE41: # %bb.0: -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32_zero: ; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0) ret float %1 @@ -220,7 +233,8 @@ ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -228,14 +242,16 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32_zero: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0) ret float %1 @@ -259,7 +275,8 @@ ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, 
%xmm0 ; SSE41-NEXT: retq ; @@ -269,7 +286,8 @@ ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -280,7 +298,8 @@ ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -310,7 +329,8 @@ ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -321,7 +341,8 @@ ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -358,17 +379,20 @@ ; ; SSE41-LABEL: test_v2f32_undef: ; SSE41: # %bb.0: -; SSE41-NEXT: haddps %xmm0, %xmm0 +; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f32_undef: ; AVX: # %bb.0: -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f32_undef: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0) ret float %1 @@ -390,7 +414,8 @@ ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -398,14 +423,16 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v4f32_undef: ; AVX512: # %bb.0: ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vhaddps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0) ret float %1 @@ -429,7 +456,8 @@ ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: 
movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -439,7 +467,8 @@ ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -450,7 +479,8 @@ ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -480,7 +510,8 @@ ; SSE41-NEXT: movaps %xmm0, %xmm1 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] ; SSE41-NEXT: addps %xmm0, %xmm1 -; SSE41-NEXT: haddps %xmm1, %xmm1 +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] +; SSE41-NEXT: addps %xmm0, %xmm1 ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq ; @@ -491,7 +522,8 @@ ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -518,53 +550,43 @@ ; define double @test_v2f64(double %a0, <2 x double> %a1) { -; SSE2-LABEL: test_v2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: haddpd %xmm1, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v2f64: +; SSE: # %bb.0: +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v2f64: ; AVX: # %bb.0: -; AVX-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vhaddpd %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX512-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1) ret double %1 } define double @test_v4f64(double %a0, <4 x double> %a1) { -; SSE2-LABEL: test_v4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movapd %xmm1, %xmm0 -; SSE41-NEXT: addpd %xmm2, %xmm0 -; SSE41-NEXT: haddpd %xmm0, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: test_v4f64: +; SSE: # %bb.0: +; SSE-NEXT: addpd %xmm2, %xmm1 +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: addpd %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -573,7 +595,8 @@
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX512-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -582,31 +605,23 @@
}
define double @test_v8f64(double %a0, <8 x double> %a1) {
-; SSE2-LABEL: test_v8f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm4, %xmm2
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: addpd %xmm4, %xmm2
-; SSE41-NEXT: addpd %xmm3, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8f64:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm4, %xmm2
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm2, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -627,32 +642,19 @@
}
define double @test_v16f64(double %a0, <16 x double> %a1) {
-; SSE2-LABEL: test_v16f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm6, %xmm2
-; SSE2-NEXT: addpd %xmm7, %xmm3
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: addpd %xmm2, %xmm4
-; SSE2-NEXT: addpd %xmm1, %xmm4
-; SSE2-NEXT: movapd %xmm4, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm6, %xmm2
-; SSE41-NEXT: addpd %xmm7, %xmm3
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16f64:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm7, %xmm3
+; SSE-NEXT: addpd %xmm5, %xmm1
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
+; SSE-NEXT: addpd %xmm2, %xmm4
+; SSE-NEXT: addpd %xmm1, %xmm4
+; SSE-NEXT: movapd %xmm4, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE-NEXT: addpd %xmm4, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64:
; AVX: # %bb.0:
@@ -661,7 +663,8 @@
; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -687,53 +690,45 @@
;
define double @test_v2f64_zero(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v2f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_zero:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
ret double %1
}
define double @test_v4f64_zero(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -742,7 +737,8 @@
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -751,31 +747,24 @@
}
define double @test_v8f64_zero(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -796,32 +785,19 @@
}
define double @test_v16f64_zero(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm6, %xmm2
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm7, %xmm3
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16f64_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm6, %xmm2
-; SSE41-NEXT: addpd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm7, %xmm3
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm0, %xmm1
-; SSE41-NEXT: haddpd %xmm1, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16f64_zero:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm4, %xmm0
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm7, %xmm3
+; SSE-NEXT: addpd %xmm5, %xmm1
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX: # %bb.0:
@@ -830,7 +806,8 @@
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -856,53 +833,45 @@
;
define double @test_v2f64_undef(<2 x double> %a0) {
-; SSE2-LABEL: test_v2f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v2f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX: # %bb.0:
-; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512: # %bb.0:
-; AVX512-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
ret double %1
}
define double @test_v4f64_undef(<4 x double> %a0) {
-; SSE2-LABEL: test_v4f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v4f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -911,7 +880,8 @@
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -920,31 +890,24 @@
}
define double @test_v8f64_undef(<8 x double> %a0) {
-; SSE2-LABEL: test_v8f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: haddpd %xmm0, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v8f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -965,32 +928,19 @@
}
define double @test_v16f64_undef(<16 x double> %a0) {
-; SSE2-LABEL: test_v16f64_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: addpd %xmm6, %xmm2
-; SSE2-NEXT: addpd %xmm4, %xmm0
-; SSE2-NEXT: addpd %xmm2, %xmm0
-; SSE2-NEXT: addpd %xmm7, %xmm3
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: addpd %xmm3, %xmm1
-; SSE2-NEXT: addpd %xmm0, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16f64_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: addpd %xmm6, %xmm2
-; SSE41-NEXT: addpd %xmm4, %xmm0
-; SSE41-NEXT: addpd %xmm2, %xmm0
-; SSE41-NEXT: addpd %xmm7, %xmm3
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: addpd %xmm3, %xmm1
-; SSE41-NEXT: addpd %xmm0, %xmm1
-; SSE41-NEXT: haddpd %xmm1, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: test_v16f64_undef:
+; SSE: # %bb.0:
+; SSE-NEXT: addpd %xmm6, %xmm2
+; SSE-NEXT: addpd %xmm4, %xmm0
+; SSE-NEXT: addpd %xmm2, %xmm0
+; SSE-NEXT: addpd %xmm7, %xmm3
+; SSE-NEXT: addpd %xmm5, %xmm1
+; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd %xmm0, %xmm1
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: addpd %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX: # %bb.0:
@@ -999,7 +949,8 @@
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2700,36 +2700,21 @@
}
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
-; SSE2-LABEL: PR22377:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; SSE2-NEXT: addps %xmm0, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR22377:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps %xmm0, %xmm1
-; SSSE3-NEXT: haddps %xmm0, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR22377:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: haddps %xmm0, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: PR22377:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; SSE-NEXT: addps %xmm0, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: retq
;
; AVX-LABEL: PR22377:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: retq
entry:
%s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>