Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -29053,6 +29053,40 @@
   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
 }
 
+/// Eliminate a redundant shuffle of a horizontal math op.
+static SDValue foldShuffleOfHorizOp(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+    return SDValue();
+
+  SDValue HOp = N->getOperand(0);
+  if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
+      HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
+    return SDValue();
+
+  // 128-bit horizontal math instructions are defined to operate on adjacent
+  // lanes of each operand as:
+  //   v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
+  // ...similarly for v2f64 and v8i16.
+  // TODO: 256-bit is not the same because...x86.
+  if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+    return SDValue();
+
+  // When the operands of a horizontal math op are identical, the low half of
+  // the result is the same as the high half. If the shuffle is also replicating
+  // low and high halves, we don't need the shuffle.
+  // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
+  // but this should be tied to whatever horizontal op matching and shuffle
+  // canonicalization are producing.
+  if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
+      isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+    return HOp;
+
+  return SDValue();
+}
+
 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
@@ -29061,10 +29095,14 @@
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
-  if (TLI.isTypeLegal(VT))
+  if (TLI.isTypeLegal(VT)) {
     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
       return AddSub;
 
+    if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+      return HAddSub;
+  }
+
   // During Type Legalization, when promoting illegal vector types,
   // the backend might introduce new shuffle dag nodes and bitcasts.
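For context, this is the shape of IR the new combine fires on. The sketch below is illustrative (the function name is hypothetical; the body is modeled on the hadd_v4i32 test that follows): the even/odd lane extraction feeding the add is matched to X86ISD::HADD with identical operands, and the trailing half-replicating shuffle is what foldShuffleOfHorizOp removes.

; Both hadd operands are %a, so the high half of %hop equals its low half.
define <4 x i32> @hadd_dup(<4 x i32> %a) {
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  ; Mask {0,1,0,1} only replicates the low half, so the combine returns %hop
  ; directly and no pshufd appears in the output.
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %shuf
}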
Index: test/CodeGen/X86/haddsub-shuf.ll
===================================================================
--- test/CodeGen/X86/haddsub-shuf.ll
+++ test/CodeGen/X86/haddsub-shuf.ll
@@ -9,13 +9,11 @@
 ; SSSE3-LABEL: hadd_v4f32:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: haddps %xmm0, %xmm0
-; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd_v4f32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT: retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -28,13 +26,11 @@
 ; SSSE3-LABEL: hsub_v4f32:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: hsubps %xmm0, %xmm0
-; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hsub_v4f32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT: retq
   %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
   %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
@@ -47,13 +43,11 @@
 ; SSSE3-LABEL: hadd_v2f64:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: haddpd %xmm0, %xmm0
-; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd_v2f64:
 ; AVX: # BB#0:
 ; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT: retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -66,13 +60,11 @@
 ; SSSE3-LABEL: hsub_v2f64:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: hsubpd %xmm0, %xmm0
-; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hsub_v2f64:
 ; AVX: # BB#0:
 ; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT: retq
   %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
   %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
@@ -85,13 +77,11 @@
 ; SSSE3-LABEL: hadd_v4i32:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd_v4i32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT: retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -104,13 +94,11 @@
 ; SSSE3-LABEL: hsub_v4i32:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: phsubd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hsub_v4i32:
 ; AVX: # BB#0:
 ; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT: retq
   %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
   %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
@@ -123,13 +111,11 @@
 ; SSSE3-LABEL: hadd_v8i16:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hadd_v8i16:
 ; AVX: # BB#0:
 ; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT: retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -142,13 +128,11 @@
 ; SSSE3-LABEL: hsub_v8i16:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: phsubw %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSSE3-NEXT: retq
 ;
 ; AVX-LABEL: hsub_v8i16:
 ; AVX: # BB#0:
 ; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT: retq
   %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
   %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>