Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -39248,51 +39248,65 @@
   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   // which is A horizontal-op B.
 
-  // At least one of the operands should be a vector shuffle.
-  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
-      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
-    return false;
-
   MVT VT = LHS.getSimpleValueType();
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for horizontal add/sub");
+  unsigned NumElts = VT.getVectorNumElements();
+
+  auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+                        SmallVectorImpl<int> &ShuffleMask) {
+    if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      if (!Op.getOperand(0).isUndef())
+        N0 = Op.getOperand(0);
+      if (!Op.getOperand(1).isUndef())
+        N1 = Op.getOperand(1);
+      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+      ShuffleMask.append(Mask.begin(), Mask.end());
+      return;
+    }
+    bool IsUnary;
+    SmallVector<SDValue, 2> SrcOps;
+    SmallVector<int, 16> SrcShuffleMask;
+    SDValue BC = peekThroughBitcasts(Op);
+    if (isTargetShuffle(BC.getOpcode()) &&
+        getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+                             SrcOps, SrcShuffleMask, IsUnary) &&
+        SrcOps.size() <= 2 && SrcShuffleMask.size() == NumElts) {
+      N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+      N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+      ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+    }
+  };
 
   // View LHS in the form
   //   LHS = VECTOR_SHUFFLE A, B, LMask
   // If LHS is not a shuffle, then pretend it is the identity shuffle:
   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
-  unsigned NumElts = VT.getVectorNumElements();
   SDValue A, B;
-  SmallVector<int, 16> LMask(NumElts);
-  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    if (!LHS.getOperand(0).isUndef())
-      A = LHS.getOperand(0);
-    if (!LHS.getOperand(1).isUndef())
-      B = LHS.getOperand(1);
-    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS)->getMask();
-    llvm::copy(Mask, LMask.begin());
-  } else {
-    A = LHS;
-    for (unsigned i = 0; i != NumElts; ++i)
-      LMask[i] = i;
-  }
+  SmallVector<int, 16> LMask;
+  GetShuffle(LHS, A, B, LMask);
 
   // Likewise, view RHS in the form
   //   RHS = VECTOR_SHUFFLE C, D, RMask
   SDValue C, D;
-  SmallVector<int, 16> RMask(NumElts);
-  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    if (!RHS.getOperand(0).isUndef())
-      C = RHS.getOperand(0);
-    if (!RHS.getOperand(1).isUndef())
-      D = RHS.getOperand(1);
-    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS)->getMask();
-    llvm::copy(Mask, RMask.begin());
-  } else {
+  SmallVector<int, 16> RMask;
+  GetShuffle(RHS, C, D, RMask);
+
+  // At least one of the operands should be a vector shuffle.
+  if (LMask.empty() && RMask.empty())
+    return false;
+
+  if (LMask.empty()) {
+    A = LHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      LMask.push_back(i);
+  }
+
+  if (RMask.empty()) {
     C = RHS;
     for (unsigned i = 0; i != NumElts; ++i)
-      RMask[i] = i;
+      RMask.push_back(i);
   }
 
   // If A and B occur in reverse order in RHS, then canonicalize by commuting
@@ -39359,7 +39373,8 @@
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
       isHorizontalBinOp(LHS, RHS, IsFadd) &&
       shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
-    return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+    return DAG.getNode(HorizOpcode, SDLoc(N), VT, DAG.getBitcast(VT, LHS),
+                       DAG.getBitcast(VT, RHS));
 
   return SDValue();
 }
@@ -42261,6 +42276,8 @@
                          ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
     };
+    Op0 = DAG.getBitcast(VT, Op0);
+    Op1 = DAG.getBitcast(VT, Op1);
     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                             HADDBuilder);
   }
@@ -42392,6 +42409,8 @@
                          ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
     };
+    Op0 = DAG.getBitcast(VT, Op0);
+    Op1 = DAG.getBitcast(VT, Op1);
     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                             HSUBBuilder);
   }
Index: test/CodeGen/X86/haddsub.ll
===================================================================
--- test/CodeGen/X86/haddsub.ll
+++ test/CodeGen/X86/haddsub.ll
@@ -1480,9 +1480,7 @@
 ; AVX-SLOW-LABEL: PR39936_v8f32:
 ; AVX-SLOW: # %bb.0:
 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -1494,9 +1492,7 @@
 ; AVX-FAST-LABEL: PR39936_v8f32:
 ; AVX-FAST: # %bb.0:
 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT: vzeroupper
Index: test/CodeGen/X86/phaddsub.ll
===================================================================
--- test/CodeGen/X86/phaddsub.ll
+++ test/CodeGen/X86/phaddsub.ll
@@ -803,32 +803,51 @@
 ; SSSE3-FAST-NEXT: movd %xmm0, %eax
 ; SSSE3-FAST-NEXT: retq
 ;
-; AVX-SLOW-LABEL: PR39936_v8i32:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR39936_v8i32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
 ;
-; AVX-FAST-LABEL: PR39936_v8i32:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vmovd %xmm0, %eax
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX1-FAST-LABEL: PR39936_v8i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: PR39936_v8i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR39936_v8i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
   %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32>
   %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32>
   %4 = add <8 x i32> %2, %3
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1732,9 +1732,7 @@
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: add_v4f64_0246_1357:
@@ -1775,9 +1773,7 @@
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: add_v4f64_4602_5713:
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2805,9 +2805,7 @@
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
@@ -2848,9 +2846,7 @@
 ; AVX1: # %bb.0: # %entry
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: