Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3787,6 +3787,14 @@
   /// \returns True, if the expansion was successful, false otherwise
   bool expandCTTZ(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;

+  /// Expand ABS nodes. Handles both scalar and vector ABS nodes;
+  /// vector nodes can only succeed if all operations are legal/custom.
+  /// (ABS x) -> (XOR (ADD x, (SRA x, type_size - 1)), (SRA x, type_size - 1))
+  /// \param N Node to expand
+  /// \param Result output after conversion
+  /// \returns True, if the expansion was successful, false otherwise
+  bool expandABS(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
+
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns MERGE_VALUEs of the scalar loads with their chains.
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2645,6 +2645,10 @@
   SDValue Tmp1, Tmp2, Tmp3, Tmp4;
   bool NeedInvert;
   switch (Node->getOpcode()) {
+  case ISD::ABS:
+    if (TLI.expandABS(Node, Tmp1, DAG))
+      Results.push_back(Tmp1);
+    break;
   case ISD::CTPOP:
     if (TLI.expandCTPOP(Node, Tmp1, DAG))
       Results.push_back(Tmp1);
Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -117,6 +117,12 @@
   /// the remaining lanes, finally bitcasting to the proper type.
   SDValue ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op);

+  /// Implement expand-based legalization of ABS vector operations.
+  /// If the following expansion is legal/custom then do it:
+  /// (ABS x) --> (XOR (ADD x, (SRA x, sizeof(x)-1)), (SRA x, sizeof(x)-1))
+  /// else unroll the operation.
+  SDValue ExpandABS(SDValue Op);
+
   /// Expand bswap of vectors into a shuffle if legal.
   SDValue ExpandBSWAP(SDValue Op);

@@ -355,6 +361,7 @@
   case ISD::FSHR:
   case ISD::ROTL:
   case ISD::ROTR:
+  case ISD::ABS:
   case ISD::BSWAP:
   case ISD::BITREVERSE:
   case ISD::CTLZ:
@@ -749,6 +756,8 @@
     return ExpandFSUB(Op);
   case ISD::SETCC:
     return UnrollVSETCC(Op);
+  case ISD::ABS:
+    return ExpandABS(Op);
   case ISD::BITREVERSE:
     return ExpandBITREVERSE(Op);
   case ISD::CTPOP:
@@ -1064,6 +1073,16 @@
   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
 }

+SDValue VectorLegalizer::ExpandABS(SDValue Op) {
+  // Attempt to expand using TargetLowering.
+  SDValue Result;
+  if (TLI.expandABS(Op.getNode(), Result, DAG))
+    return Result;
+
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandFP_TO_UINT(SDValue Op) {
   // Attempt to expand using TargetLowering.
   SDValue Result;
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4715,6 +4715,26 @@
   return true;
 }

+bool TargetLowering::expandABS(SDNode *N, SDValue &Result,
+                               SelectionDAG &DAG) const {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+
+  // Only expand vector types if we have the appropriate vector operations.
+  if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SRA, VT) ||
+                        !isOperationLegalOrCustom(ISD::ADD, VT) ||
+                        !isOperationLegalOrCustomOrPromote(ISD::XOR, VT)))
+    return false;
+
+  SDValue Shift =
+      DAG.getNode(ISD::SRA, dl, VT, Op,
+                  DAG.getConstant(VT.getScalarSizeInBits() - 1, dl, VT));
+  SDValue Add = DAG.getNode(ISD::ADD, dl, VT, Op, Shift);
+  Result = DAG.getNode(ISD::XOR, dl, VT, Add, Shift);
+  return true;
+}
+
 SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
                                             SelectionDAG &DAG) const {
   SDLoc SL(LD);
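For reference, the expansion emitted by TargetLowering::expandABS() above is the classic two's-complement sign-mask trick: the arithmetic shift by width-1 produces 0 for non-negative inputs and all-ones for negative ones, so the add/xor pair is either a no-op or a full negation. A minimal scalar C++ sketch of one lane (the helper name and the use of unsigned arithmetic are illustrative only, not taken from the patch):

#include <cassert>
#include <cstdint>

// abs(x) = (x + mask) ^ mask with mask = x >> 63 (arithmetic shift):
// mask is 0 for x >= 0 (add/xor do nothing) and all-ones for x < 0
// (add subtracts one, xor complements, i.e. a two's-complement negation).
// The arithmetic is done in uint64_t so the INT64_MIN case wraps the way
// ISD::ABS specifies instead of being undefined behaviour in C++.
int64_t abs_via_sra_add_xor(int64_t x) {
  uint64_t mask = static_cast<uint64_t>(x >> 63); // relies on arithmetic >>
  return static_cast<int64_t>((static_cast<uint64_t>(x) + mask) ^ mask);
}

int main() {
  assert(abs_via_sra_add_xor(5) == 5);
  assert(abs_via_sra_add_xor(-7) == 7);
  return 0;
}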
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -867,6 +867,7 @@
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::CTPOP, VT, Custom);
+      setOperationAction(ISD::ABS, VT, Custom);

       // The condition codes aren't legal in SSE/AVX and under AVX512 we use
       // setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1207,6 +1208,7 @@
     setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
     setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

+    setOperationAction(ISD::ABS, MVT::v4i64, Custom);
     setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
     setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
     setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
@@ -23567,7 +23569,8 @@
   return split256IntArith(Op, DAG);
 }

-static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
+                        SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
     // Since X86 does not have CMOV for 8-bit integer, we don't convert
@@ -23581,10 +23584,23 @@
     return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
   }

-  assert(Op.getSimpleValueType().is256BitVector() &&
-         Op.getSimpleValueType().isInteger() &&
-         "Only handle AVX 256-bit vector integer operation");
-  return Lower256IntUnary(Op, DAG);
+  // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
+  if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
+    SDLoc DL(Op);
+    SDValue Src = Op.getOperand(0);
+    SDValue Sub =
+        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
+    return DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, Src, Sub, Src);
+  }
+
+  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
+    assert(VT.isInteger() &&
+           "Only handle AVX 256-bit vector integer operation");
+    return Lower256IntUnary(Op, DAG);
+  }
+
+  // Default to expand.
+  return SDValue();
 }

 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
@@ -26269,7 +26285,7 @@
   case ISD::SMIN:
   case ISD::UMAX:
   case ISD::UMIN: return LowerMINMAX(Op, DAG);
-  case ISD::ABS: return LowerABS(Op, DAG);
+  case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
   case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
   case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
   case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
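A note on why the VPBLENDVPD lowering above computes abs: the blendv family selects, per element, the second source whenever the sign bit of the corresponding mask element is set, so feeding the input itself in as the mask picks 0 - X exactly for the negative lanes. Only the sign bit of each mask lane is inspected, which is why a floating-point blend is safe on integer data here, and it avoids a 64-bit signed compare (PCMPGTQ only exists from SSE4.2 onwards). A scalar C++ sketch of one 64-bit lane (the helper names are illustrative, not taken from the patch):

#include <cassert>
#include <cstdint>

// One 64-bit lane of (V)BLENDVPD: the mask's sign bit chooses the source.
uint64_t blendv_lane(uint64_t if_clear, uint64_t if_set, uint64_t mask) {
  return (mask >> 63) ? if_set : if_clear;
}

// abs(x) as lowered above: blend between x and 0 - x, keyed on x's own sign.
uint64_t abs_lane_via_blendv(uint64_t x) {
  return blendv_lane(/*if_clear=*/x, /*if_set=*/0 - x, /*mask=*/x);
}

int main() {
  assert(abs_lane_via_blendv(42) == 42);
  assert(abs_lane_via_blendv(static_cast<uint64_t>(-42)) == 42);
  return 0;
}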
Index: test/CodeGen/AArch64/arm64-vabs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-vabs.ll
+++ test/CodeGen/AArch64/arm64-vabs.ll
@@ -542,7 +542,8 @@

 define i64 @abs_1d_honestly(i64 %A) nounwind {
 ; CHECK-LABEL: abs_1d_honestly:
-; CHECK: abs d0, d0
+; CHECK: cmp x0, #0
+; CHECK-NEXT: cneg x0, x0, mi
   %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
   ret i64 %abs
 }
Index: test/CodeGen/X86/combine-abs.ll
===================================================================
--- test/CodeGen/X86/combine-abs.ll
+++ test/CodeGen/X86/combine-abs.ll
@@ -67,12 +67,8 @@
 ; AVX2-LABEL: combine_v4i64_abs_abs:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: combine_v4i64_abs_abs:
Index: test/CodeGen/X86/viabs.ll
===================================================================
--- test/CodeGen/X86/viabs.ll
+++ test/CodeGen/X86/viabs.ll
@@ -528,29 +528,43 @@
 }

 define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
-; SSE-LABEL: test_abs_ge_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: test_abs_ge_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_abs_ge_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: test_abs_ge_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: psubq %xmm0, %xmm1
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: test_abs_ge_v2i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_ge_v2i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: test_abs_ge_v2i64:
@@ -564,39 +578,62 @@
 }

 define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
-; SSE-LABEL: test_abs_gt_v4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: paddq %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: paddq %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: test_abs_gt_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: paddq %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: paddq %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_abs_gt_v4i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm2, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: test_abs_gt_v4i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pxor %xmm4, %xmm4
+; SSE41-NEXT: psubq %xmm0, %xmm4
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: test_abs_gt_v4i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_gt_v4i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: test_abs_gt_v4i64:
@@ -610,60 +647,97 @@
 }

 define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
-; SSE-LABEL: test_abs_le_v8i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: retq
+; SSE2-LABEL: test_abs_le_v8i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_abs_le_v8i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: test_abs_le_v8i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: psubq %xmm0, %xmm6
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: psubq %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: psubq %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
+; SSE41-NEXT: psubq %xmm3, %xmm5
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
+; SSE41-NEXT: movapd %xmm4, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: test_abs_le_v8i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: test_abs_le_v8i64:
@@ -677,55 +751,103 @@
 }

 define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
-; SSE-LABEL: test_abs_le_v8i64_fold:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
-; SSE-NEXT: movdqu 32(%rdi), %xmm2
-; SSE-NEXT: movdqu 48(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: paddq %xmm4, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm3
-; SSE-NEXT: retq
+; SSE2-LABEL: test_abs_le_v8i64_fold:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; SSE2-NEXT: movdqu 32(%rdi), %xmm2
+; SSE2-NEXT: movdqu 48(%rdi), %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrad $31, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: paddq %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: test_abs_le_v8i64_fold:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqu (%rdi), %xmm0
+; SSSE3-NEXT: movdqu 16(%rdi), %xmm1
+; SSSE3-NEXT: movdqu 32(%rdi), %xmm2
+; SSSE3-NEXT: movdqu 48(%rdi), %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: paddq %xmm4, %xmm3
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: test_abs_le_v8i64_fold:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm1
+; SSE41-NEXT: movdqu 16(%rdi), %xmm2
+; SSE41-NEXT: movdqu 32(%rdi), %xmm3
+; SSE41-NEXT: movdqu 48(%rdi), %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: psubq %xmm1, %xmm6
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: psubq %xmm2, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2
+; SSE41-NEXT: pxor %xmm6, %xmm6
+; SSE41-NEXT: psubq %xmm3, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
+; SSE41-NEXT: psubq %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: movapd %xmm2, %xmm1
+; SSE41-NEXT: movapd %xmm3, %xmm2
+; SSE41-NEXT: movapd %xmm4, %xmm3
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: test_abs_le_v8i64_fold:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm7, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm5
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vmovupd (%rdi), %ymm0
+; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq 16(%rdi), %xmm2, %xmm3
+; AVX1-NEXT: vpsubq (%rdi), %xmm2, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX1-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vpsubq 48(%rdi), %xmm2, %xmm3
+; AVX1-NEXT: vpsubq 32(%rdi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i64_fold:
@@ -733,12 +855,10 @@
 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: test_abs_le_v8i64_fold: