Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -32962,6 +32962,29 @@ return true; break; } + case X86ISD::HADD: + case X86ISD::HSUB: + case X86ISD::FHADD: + case X86ISD::FHSUB: { + // 256-bit horizontal ops are two 128-bit ops glued together. If we do not + // demand any of the high elements, then narrow the h-op to 128-bits: + // (hop ymm0, ymm1) --> insert undef, (hop xmm0, xmm1), 0 + if (VT.is256BitVector() && DemandedElts.lshr(NumElts / 2) == 0) { + SDLoc DL(Op); + EVT VT128 = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), + VT.getVectorNumElements() / 2); + SDValue Zero = TLO.DAG.getIntPtrConstant(0, DL); + SDValue Ext0 = TLO.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT128, + Op.getOperand(0), Zero); + SDValue Ext1 = TLO.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT128, + Op.getOperand(1), Zero); + SDValue NarrowHop = TLO.DAG.getNode(Opc, DL, VT128, Ext0, Ext1); + SDValue Insert = TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, + TLO.DAG.getUNDEF(VT), NarrowHop, Zero); + return TLO.CombineTo(Op, Insert); + } + break; + } } // Simplify target shuffles. 
Index: test/CodeGen/X86/haddsub.ll =================================================================== --- test/CodeGen/X86/haddsub.ll +++ test/CodeGen/X86/haddsub.ll @@ -1392,11 +1392,10 @@ ; AVX-FAST-LABEL: fadd_reduce_v8f32: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-FAST-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-FAST-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-FAST-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vhaddps %ymm0, %ymm0, %ymm0 -; AVX-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1) @@ -1431,9 +1430,8 @@ ; AVX-FAST-LABEL: fadd_reduce_v4f64: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm0 -; AVX-FAST-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm0, %ymm0 -; AVX-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-FAST-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0 ; AVX-FAST-NEXT: vzeroupper ; AVX-FAST-NEXT: retq %r = call fast double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1) Index: test/CodeGen/X86/phaddsub-extract.ll =================================================================== --- test/CodeGen/X86/phaddsub-extract.ll +++ test/CodeGen/X86/phaddsub-extract.ll @@ -932,32 +932,14 @@ ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq ; -; AVX1-FAST-LABEL: partial_reduction_add_v8i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-FAST-LABEL: partial_reduction_add_v8i32: -; 
AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovd %xmm0, %eax -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-FAST-LABEL: partial_reduction_add_v8i32: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-FAST-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vphaddd %ymm0, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vmovd %xmm0, %eax -; AVX512-FAST-NEXT: vzeroupper -; AVX512-FAST-NEXT: retq +; AVX-FAST-LABEL: partial_reduction_add_v8i32: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vmovd %xmm0, %eax +; AVX-FAST-NEXT: vzeroupper +; AVX-FAST-NEXT: retq %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %x0213 = add <8 x i32> %x, %x23 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> @@ -1058,32 +1040,14 @@ ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq ; -; AVX1-FAST-LABEL: partial_reduction_sub_v8i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-FAST-LABEL: partial_reduction_sub_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovd %xmm0, %eax -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-FAST-LABEL: partial_reduction_sub_v8i32: -; AVX512-FAST: # %bb.0: -; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-FAST-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-FAST-NEXT: vphsubd %ymm0, %ymm0, %ymm0 
-; AVX512-FAST-NEXT: vmovd %xmm0, %eax -; AVX512-FAST-NEXT: vzeroupper -; AVX512-FAST-NEXT: retq +; AVX-FAST-LABEL: partial_reduction_sub_v8i32: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-FAST-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vphsubd %xmm0, %xmm0, %xmm0 +; AVX-FAST-NEXT: vmovd %xmm0, %eax +; AVX-FAST-NEXT: vzeroupper +; AVX-FAST-NEXT: retq %x23 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> %x0213 = sub <8 x i32> %x, %x23 %x13 = shufflevector <8 x i32> %x0213, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>