Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -35624,6 +35624,57 @@
   llvm_unreachable("All opcodes should return within switch");
 }
 
+/// Try to convert a vector reduction sequence composed of binops and shuffles
+/// into horizontal ops.
+static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
+                                            const X86Subtarget &Subtarget) {
+  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+  bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+  if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+    return SDValue();
+  SDValue Index = ExtElt->getOperand(1);
+  if (!isNullConstant(Index))
+    return SDValue();
+
+  // TODO: Allow FADD with reduction and/or reassociation and no-signed-zeros.
+  ISD::NodeType Opc;
+  SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD});
+  if (!Rdx)
+    return SDValue();
+
+  EVT VT = ExtElt->getValueType(0);
+  EVT VecVT = ExtElt->getOperand(0).getValueType();
+  if (VecVT.getScalarType() != VT)
+    return SDValue();
+
+  unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
+  SDLoc DL(ExtElt);
+
+  // 256-bit horizontal instructions operate on 128-bit chunks rather than
+  // across the whole vector, so we need an extract + hop preliminary stage.
+  // This is the only step where the operands of the hop are not the same value.
+  // TODO: We could extend this to handle 512-bit or even longer vectors.
+  if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
+      ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
+    unsigned NumElts = VecVT.getVectorNumElements();
+    SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
+    SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
+    VecVT = EVT::getVectorVT(*DAG.getContext(), VT, NumElts / 2);
+    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Hi, Lo);
+  }
+  if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
+      !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
+    return SDValue();
+
+  // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
+  assert(Rdx.getValueType() == VecVT && "Unexpected reduction match");
+  unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
+  for (unsigned i = 0; i != ReductionSteps; ++i)
+    Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
+}
+
 /// Detect vector gather/scatter index generation and convert it from being a
 /// bunch of shuffles and extracts into a somewhat faster sequence.
 /// For i686, the best sequence is apparently storing the value and loading
@@ -35710,6 +35761,9 @@
   if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
     return MinMax;
 
+  if (SDValue V = combineReductionToHorizontal(N, DAG, Subtarget))
+    return V;
+
   if (SDValue V = scalarizeExtEltFP(N, DAG))
     return V;
 
Index: llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
+++ llvm/trunk/test/CodeGen/X86/phaddsub-extract.ll
@@ -1903,10 +1903,8 @@
 ;
 ; SSE3-FAST-LABEL: hadd16_8:
 ; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-FAST-NEXT: paddw %xmm0, %xmm1
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-FAST-NEXT: paddw %xmm1, %xmm0
+; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
 ; SSE3-FAST-NEXT: phaddw %xmm0, %xmm0
 ; SSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSE3-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1926,10 +1924,8 @@
 ;
 ; AVX-FAST-LABEL: hadd16_8:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1956,10 +1952,9 @@
 ;
 ; SSE3-FAST-LABEL: hadd32_4:
 ; SSE3-FAST: # %bb.0:
-; SSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-FAST-NEXT: paddd %xmm0, %xmm1
-; SSE3-FAST-NEXT: phaddd %xmm1, %xmm1
-; SSE3-FAST-NEXT: movd %xmm1, %eax
+; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: hadd32_4:
@@ -1973,8 +1968,7 @@
 ;
 ; AVX-FAST-LABEL: hadd32_4:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX-FAST-NEXT: retq
@@ -2097,10 +2091,8 @@
 define i16 @hadd16_8_optsize(<8 x i16> %x223) optsize {
 ; SSE3-LABEL: hadd16_8_optsize:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-NEXT: paddw %xmm0, %xmm1
-; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE3-NEXT: paddw %xmm1, %xmm0
+; SSE3-NEXT: phaddw %xmm0, %xmm0
+; SSE3-NEXT: phaddw %xmm0, %xmm0
 ; SSE3-NEXT: phaddw %xmm0, %xmm0
 ; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: # kill: def $ax killed $ax killed $eax
@@ -2108,10 +2100,8 @@
 ;
 ; AVX-LABEL: hadd16_8_optsize:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vmovd %xmm0, %eax
 ; AVX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -2129,16 +2119,14 @@
 define i32 @hadd32_4_optsize(<4 x i32> %x225) optsize {
 ; SSE3-LABEL: hadd32_4_optsize:
 ; SSE3: # %bb.0:
-; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE3-NEXT: paddd %xmm0, %xmm1
-; SSE3-NEXT: phaddd %xmm1, %xmm1
-; SSE3-NEXT: movd %xmm1, %eax
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: phaddd %xmm0, %xmm0
+; SSE3-NEXT: movd %xmm0, %eax
 ; SSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd32_4_optsize:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vmovd %xmm0, %eax
 ; AVX-NEXT: retq
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
@@ -254,8 +254,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4i32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: retq
@@ -307,9 +306,8 @@
 ; AVX1-FAST-LABEL: test_v8i32:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: vzeroupper
@@ -635,10 +633,8 @@
 ;
 ; AVX1-FAST-LABEL: test_v8i16:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -704,11 +700,9 @@
 ; AVX1-FAST-LABEL: test_v16i16:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add.ll
@@ -241,8 +241,7 @@
 ;
 ; AVX1-FAST-LABEL: test_v4i32:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: retq
@@ -294,9 +293,8 @@
 ; AVX1-FAST-LABEL: test_v8i32:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: vzeroupper
@@ -605,10 +603,8 @@
 ;
 ; AVX1-FAST-LABEL: test_v8i16:
 ; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
@@ -674,11 +670,9 @@
 ; AVX1-FAST-LABEL: test_v16i16:
 ; AVX1-FAST: # %bb.0:
 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
 ; AVX1-FAST-NEXT: vmovd %xmm0, %eax
 ; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
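
For context, here is a minimal LLVM IR sketch of the kind of add-reduction the new combine matches (the function name, value names, and flags below are illustrative, not copied from the test files). Compiled with something like llc -mtriple=x86_64-unknown-unknown -mattr=+ssse3,fast-hops, or at optsize, the shuffle/add ladder feeding an extract of lane 0 should now lower to two phaddd instructions plus a movd instead of the pshufd/paddd sequence shown on the removed lines above:

define i32 @reduce_v4i32(<4 x i32> %v) {
  ; add the high half of the vector into the low half
  %hi = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %sum1 = add <4 x i32> %v, %hi
  ; add element 1 into element 0
  %hi2 = shufflevector <4 x i32> %sum1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %sum2 = add <4 x i32> %sum1, %hi2
  ; combineReductionToHorizontal fires on this extract of lane 0
  %r = extractelement <4 x i32> %sum2, i32 0
  ret i32 %r
}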