Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -9558,6 +9558,122 @@
       %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
 
+
+'``llvm.uabsdiff.*``' and '``llvm.sabsdiff.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+These are overloaded intrinsics. The operands and the result are vectors of any integer or
+floating point element type, and all three have the same type.
+
+.. code-block:: llvm
+
+      declare <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b)
+      declare <4 x float> @llvm.sabsdiff.v4f32(<4 x float> %a, <4 x float> %b)
+
+
+Overview:
+"""""""""
+
+The ``llvm.uabsdiff`` intrinsic returns a vector result of the element-wise absolute difference
+of its two operands, treating the elements as unsigned integers or floats.
+
+The ``llvm.sabsdiff`` intrinsic returns a vector result of the element-wise absolute difference
+of its two operands, treating the elements as signed integers or floats.
+
+.. note::
+
+    These intrinsics are primarily intended for use during the optimization phases.
+
+Arguments:
+""""""""""
+
+Both intrinsics take two vector arguments of the same type; the result has that type as well.
+
+Semantics:
+""""""""""
+
+The expression::
+
+      call <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+is equivalent to::
+
+      %sub = sub <4 x i32> %a, %b
+      %ispos = icmp uge <4 x i32> %a, %b
+      %neg = sub <4 x i32> zeroinitializer, %sub
+      %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
+
+Similarly, the expression::
+
+      call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+is equivalent to::
+
+      %sub = sub nsw <4 x i32> %a, %b
+      %ispos = icmp sgt <4 x i32> %sub, zeroinitializer
+      %neg = sub nsw <4 x i32> zeroinitializer, %sub
+      %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg
+
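As a usage sketch (added for illustration, not text from the patch): a sum-of-absolute-differences
reduction can be formed by feeding ``llvm.uabsdiff`` into the ``llvm.uhadd`` intrinsic documented
below; this is the pattern the X86 combine later in this patch maps to ``psadbw`` (compare
``test_v16i8_uhadd_combine`` in the new test file)::

      ; illustrative IR only; function and value names are placeholders
      declare <16 x i8> @llvm.uabsdiff.v16i8.v16i8.v16i8(<16 x i8>, <16 x i8>)
      declare i32 @llvm.uhadd.i32.v16i8(<16 x i8>)

      define i32 @sad_16i8(<16 x i8> %a, <16 x i8> %b) {
        %diff = call <16 x i8> @llvm.uabsdiff.v16i8.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
        %sum = call i32 @llvm.uhadd.i32.v16i8(<16 x i8> %diff)
        ret i32 %sum
      }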
+
+'``llvm.uhadd.*``' and '``llvm.shadd.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+These are overloaded intrinsics. The operand is a vector of any integer or floating point element
+type; the result is a scalar.
+
+.. code-block:: llvm
+
+      declare i32 @llvm.uhadd.i32.v4i32(<4 x i32> %a)
+      declare i32 @llvm.shadd.i32.v16i8(<16 x i8> %a)
+
+Overview:
+"""""""""
+
+The ``llvm.uhadd`` intrinsic returns the horizontal (reduction) sum of the elements of its vector
+operand, treating the elements as unsigned integers or floats.
+
+The ``llvm.shadd`` intrinsic returns the horizontal (reduction) sum of the elements of its vector
+operand, treating the elements as signed integers or floats.
+
+.. note::
+
+    These intrinsics are primarily intended for use during the optimization phases.
+
+Arguments:
+""""""""""
+
+Both intrinsics take a single vector argument.
+
+Semantics:
+""""""""""
+
+The expression::
+
+      call i32 @llvm.uhadd.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+      %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+      %2 = add <4 x i32> %a, %1
+      %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+      %4 = add <4 x i32> %2, %3
+      %5 = extractelement <4 x i32> %4, i32 0
+
+Similarly, the expression::
+
+      call i32 @llvm.shadd.i32.v4i32(<4 x i32> %a)
+
+is equivalent to::
+
+      %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+      %2 = add nsw <4 x i32> %a, %1
+      %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+      %4 = add nsw <4 x i32> %2, %3
+      %5 = extractelement <4 x i32> %4, i32 0
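A small worked example (added for illustration, not text from the patch): with the expansion
above, summing a constant vector gives the expected scalar::

      %r = call i32 @llvm.uhadd.i32.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)  ; yields i32 10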
+
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -329,6 +329,16 @@
     /// Byte Swap and Counting operators.
     BSWAP, CTTZ, CTLZ, CTPOP,
 
+    /// [SU]ABSD - Signed/Unsigned absolute difference of two input integer or
+    /// float vectors.
+    /// These nodes are generated from the llvm.*absdiff.* intrinsics.
+    SABSD, UABSD,
+
+    /// [SU]HADD - Signed/Unsigned horizontal sum across the elements of a
+    /// single integer or float input vector.
+    /// These nodes are generated from the llvm.*hadd.* intrinsics.
+    UHADD, SHADD,
+
     /// Bit counting operators with an undefined result for zero inputs.
     CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF,
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -595,6 +595,15 @@
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+// Calculate the absolute difference of the two input vectors.
+def int_sabsdiff : Intrinsic<[llvm_anyvector_ty],
+                             [ llvm_anyvector_ty, llvm_anyvector_ty ], [IntrNoMem]>;
+def int_uabsdiff : Intrinsic<[llvm_anyvector_ty],
+                             [ llvm_anyvector_ty, llvm_anyvector_ty ], [IntrNoMem]>;
+
+// Calculate the horizontal/reduction sum across the elements of the input vector.
+def int_uhadd : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+def int_shadd : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>;
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- include/llvm/Target/TargetSelectionDAG.td
+++ include/llvm/Target/TargetSelectionDAG.td
@@ -385,6 +385,10 @@
 def umin       : SDNode<"ISD::UMIN"       , SDTIntBinOp>;
 def umax       : SDNode<"ISD::UMAX"       , SDTIntBinOp>;
+def sabsd      : SDNode<"ISD::SABSD"      , SDTIntBinOp>;
+def shadd      : SDNode<"ISD::SHADD"      , SDTIntUnaryOp>;
+def uabsd      : SDNode<"ISD::UABSD"      , SDTIntBinOp>;
+def uhadd      : SDNode<"ISD::UHADD"      , SDTIntUnaryOp>;
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
 def bswap      : SDNode<"ISD::BSWAP"      , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"       , SDTIntUnaryOp>;
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2871,6 +2871,45 @@
   case ISD::BSWAP:
     Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
     break;
+  case ISD::UABSD:
+  case ISD::SABSD: {
+    EVT VT = Node->getValueType(0);
+    unsigned int SubOpc = ISD::SUB;
+    Tmp2 = Node->getOperand(0);
+    Tmp3 = Node->getOperand(1);
+    if (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) {
+      Tmp1 = DAG.getNode(SubOpc, dl, VT, Tmp2, Tmp3);
+      Tmp2 = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Tmp1);
+      Tmp4 =
+          DAG.getNode(ISD::SETCC, dl, getSetCCResultType(VT), Tmp2,
+                      DAG.getConstant(0, dl, VT), DAG.getCondCode(ISD::SETLT));
+      Tmp1 = DAG.getNode(ISD::VSELECT, dl, VT, Tmp4, Tmp1, Tmp2);
+    }
+    Results.push_back(Tmp1);
+    break;
+  }
+  case ISD::UHADD:
+  case ISD::SHADD: {
+    // FIXME: Improve the expansion
+    SDValue OpVal = Node->getOperand(0);
+    EVT VT = OpVal.getValueType();
+    unsigned NumElems = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+
+    SmallVector<int, 8> ShuffleMask(NumElems, -1);
+    for (unsigned i = NumElems; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = (i / 2 + j);
+      SDValue Shuffle = DAG.getVectorShuffle(VT, dl, OpVal, DAG.getUNDEF(VT),
+                                             &ShuffleMask[0]);
+      OpVal = DAG.getNode(ISD::ADD, dl, VT, OpVal, Shuffle);
+    }
+    Tmp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, OpVal,
+                       DAG.getConstant(0, dl, TLI.getVectorIdxTy()));
+    Results.push_back(Tmp1);
+    break;
+  }
   case ISD::FRAMEADDR:
   case ISD::RETURNADDR:
   case ISD::FRAME_TO_ARGS_OFFSET:
Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -146,6 +146,10 @@
   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
     Res = PromoteIntRes_AtomicCmpSwap(cast<AtomicSDNode>(N), ResNo);
     break;
+  case ISD::UABSD:
+  case ISD::SABSD:
+    Res = PromoteIntRes_AbsoluteDiff(N);
+    break;
   }
 
   // If the result is null then the sub-method took care of registering it.
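A note on this promotion path (illustration only, not part of the patch): when the vector element
type is illegal, the hooks added to LegalizeIntegerTypes.cpp extend the operands to a legal element
width before the node is expanded, zero-extending for the unsigned variants and sign-extending for
the signed ones. Roughly, at the IR level::

      ; what promotion amounts to for a <4 x i8> unsigned absolute difference; the
      ; vpand-with-255 sequence checked in test/CodeGen/X86/absdiff_intrinsic.ll below
      ; is the x86 form of these zero-extensions
      %a32 = zext <4 x i8> %a to <4 x i32>
      %b32 = zext <4 x i8> %b to <4 x i32>
      %d = call <4 x i32> @llvm.uabsdiff.v4i32.v4i32.v4i32(<4 x i32> %a32, <4 x i32> %b32)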
@@ -153,6 +157,32 @@ SetPromotedInteger(SDValue(N, ResNo), Res); } +SDValue DAGTypeLegalizer::PromoteIntRes_AbsoluteDiff(SDNode *N) { + SDValue LHS, RHS; + if (N->getOpcode() == ISD::SABSD) { + LHS = SExtPromotedInteger(N->getOperand(0)); + RHS = SExtPromotedInteger(N->getOperand(1)); + } else { + LHS = ZExtPromotedInteger(N->getOperand(0)); + RHS = ZExtPromotedInteger(N->getOperand(1)); + } + return DAG.getNode(N->getOpcode(), SDLoc(N), + LHS.getValueType(), LHS, RHS); +} + +SDValue DAGTypeLegalizer::PromoteIntRes_HADD(SDNode *N) { + SDValue LHS, RHS; + if (N->getOpcode() == ISD::SHADD) { + LHS = SExtPromotedInteger(N->getOperand(0)); + RHS = SExtPromotedInteger(N->getOperand(1)); + } else { + LHS = ZExtPromotedInteger(N->getOperand(0)); + RHS = ZExtPromotedInteger(N->getOperand(1)); + } + return DAG.getNode(N->getOpcode(), SDLoc(N), + LHS.getValueType(), LHS, RHS); +} + SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo) { SDValue Op = DisintegrateMERGE_VALUES(N, ResNo); @@ -875,6 +905,8 @@ case ISD::SRL: case ISD::ROTL: case ISD::ROTR: Res = PromoteIntOp_Shift(N); break; + case ISD::UHADD: + case ISD::SHADD: Res = PromoteIntOp_HADD(N); break; } // If the result is null, the sub-method took care of registering results etc. @@ -1208,6 +1240,16 @@ N->getOperand(0).getValueType().getScalarType()); } +SDValue DAGTypeLegalizer::PromoteIntOp_HADD(SDNode *N) { + SDValue Op; + if (N->getOpcode() == ISD::SHADD) { + Op = SExtPromotedInteger(N->getOperand(0)); + } else { + Op = ZExtPromotedInteger(N->getOperand(0)); + } + return DAG.getNode(N->getOpcode(), SDLoc(N), + N->getValueType(0), Op); +} //===----------------------------------------------------------------------===// // Integer Result Expansion Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -264,6 +264,8 @@ SDValue PromoteIntRes_UNDEF(SDNode *N); SDValue PromoteIntRes_VAARG(SDNode *N); SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo); + SDValue PromoteIntRes_AbsoluteDiff(SDNode *N); + SDValue PromoteIntRes_HADD(SDNode *N); // Integer Operand Promotion. bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo); @@ -294,6 +296,7 @@ SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N); SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_HADD(SDNode *N); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -639,6 +642,7 @@ SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); + SDValue SplitVecOp_HADD(SDNode *N); SDValue SplitVecOp_BITCAST(SDNode *N); SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N); Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -59,6 +59,9 @@ /// \brief Implements unrolling a VSETCC. SDValue UnrollVSETCC(SDValue Op); + /// \brief Implements unrolling a [SU]HADD. + SDValue UnrollHADD(SDValue Op); + /// \brief Implement expand-based legalization of vector operations. 
/// /// This is just a high-level routine to dispatch to specific code paths for @@ -704,6 +707,9 @@ return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::UHADD: + case ISD::SHADD: + return UnrollHADD(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } @@ -1010,6 +1016,27 @@ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } +SDValue VectorLegalizer::UnrollHADD(SDValue Op) { + EVT VT = Op.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + SDValue OpVal = Op.getOperand(0); + EVT TmpEltVT = OpVal.getValueType().getVectorElementType(); + SDLoc dl(Op); + + SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, OpVal, + DAG.getConstant(0, dl, TLI.getVectorIdxTy())); + SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, OpVal, + DAG.getConstant(1, dl, TLI.getVectorIdxTy())); + SDValue Ops = DAG.getNode(ISD::ADD, dl, VT, LHSElem, RHSElem); + + for (unsigned i = 2; i < NumElems; ++i) { + LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, OpVal, + DAG.getConstant(i, dl, TLI.getVectorIdxTy())); + Ops = DAG.getNode(ISD::ADD, dl, VT, LHSElem, Ops); + } + return Ops; +} } bool SelectionDAG::LegalizeVectors() { Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -640,6 +640,8 @@ case ISD::SINT_TO_FP: case ISD::TRUNCATE: case ISD::UINT_TO_FP: + case ISD::UHADD: + case ISD::SHADD: SplitVecRes_UnaryOp(N, Lo, Hi); break; @@ -675,6 +677,8 @@ case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: + case ISD::UABSD: + case ISD::SABSD: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: @@ -1394,6 +1398,10 @@ case ISD::FTRUNC: Res = SplitVecOp_UnaryOp(N); break; + case ISD::UHADD: + case ISD::SHADD: + Res = SplitVecOp_HADD(N); + break; } } @@ -1479,6 +1487,18 @@ JoinIntegers(Lo, Hi)); } +SDValue DAGTypeLegalizer::SplitVecOp_HADD(SDNode *N) { + SDValue Lo, Hi; + EVT SubVT = N->getValueType(0); + SDValue OpVal = N->getOperand(0); + SDLoc dl(N); + GetSplitVector(OpVal, Lo, Hi); + Lo = DAG.getNode(N->getOpcode(), dl, SubVT, Lo); + Hi = DAG.getNode(N->getOpcode(), dl, SubVT, Hi); + + return DAG.getNode(ISD::ADD, dl, SubVT, Lo, Hi); +} + SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) { // We know that the extracted result type is legal. 
EVT SubVT = N->getValueType(0); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4581,6 +4581,28 @@ getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; + case Intrinsic::uabsdiff: + setValue(&I, DAG.getNode(ISD::UABSD, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + case Intrinsic::sabsdiff: + setValue(&I, DAG.getNode(ISD::SABSD, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + case Intrinsic::uhadd: + setValue(&I, DAG.getNode(ISD::UHADD, sdl, + TLI.getValueType(I.getType()), + getValue(I.getArgOperand(0)))); + return nullptr; + case Intrinsic::shadd: + setValue(&I, DAG.getNode(ISD::SHADD, sdl, + TLI.getValueType(I.getType()), + getValue(I.getArgOperand(0)))); + return nullptr; case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -160,6 +160,8 @@ case ISD::FLOG: return "flog"; case ISD::FLOG2: return "flog2"; case ISD::FLOG10: return "flog10"; + case ISD::UHADD: return "uhadd"; + case ISD::SHADD: return "shadd"; // Binary operators case ISD::ADD: return "add"; @@ -224,6 +226,8 @@ case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; case ISD::SRL_PARTS: return "srl_parts"; + case ISD::UABSD: return "uabsd"; + case ISD::SABSD: return "sabsd"; // Conversion operators. case ISD::SIGN_EXTEND: return "sign_extend"; Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -823,6 +823,10 @@ setOperationAction(ISD::USUBO, VT, Expand); setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + setOperationAction(ISD::UABSD, VT, Expand); + setOperationAction(ISD::SABSD, VT, Expand); + setOperationAction(ISD::UHADD, VT, Expand); + setOperationAction(ISD::SHADD, VT, Expand); // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2132,6 +2132,17 @@ switch (Opcode) { default: break; + case X86ISD::USAD: { + SDNode *New; + SDValue Ops[] = {Node->getOperand(0), Node->getOperand(1)}; + + if (Subtarget->hasAVX() || Subtarget->hasSSE1() || Subtarget->hasSSE2()) + New = + CurDAG->getMachineNode(X86::PSADBWrr, dl, Node->getValueType(0), Ops); + else + New = Node; + return New; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); switch (IntNo) { Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -35,6 +35,9 @@ /// Bit scan reverse. BSR, + /// Sum of absolute differences + USAD, + /// Double shift instructions. 
These correspond to /// X86::SHLDxx and X86::SHRDxx instructions. SHLD, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -174,6 +174,9 @@ setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + setOperationAction(ISD::UHADD , MVT::i8 , Promote); + setOperationAction(ISD::SHADD , MVT::i8 , Promote); + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); @@ -721,6 +724,10 @@ setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); + setOperationAction(ISD::UABSD, VT, Expand); + setOperationAction(ISD::SABSD, VT, Expand); + setOperationAction(ISD::UHADD, VT, Expand); + setOperationAction(ISD::SHADD, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); @@ -1582,6 +1589,8 @@ } // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::UHADD); + setTargetDAGCombine(ISD::SHADD); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::BITCAST); @@ -10469,6 +10478,17 @@ // FIXME: We should custom lower this by fixing the condition and using i8 // blends. + if(Op.getSimpleValueType().SimpleTy == MVT::v8i16) { + SDValue V0, V1, V2; + SDLoc DL(Op); + V0 = Op.getOperand(0); + V1 = Op.getOperand(1); + V2 = Op.getOperand(2); + V0 = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V0); + V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V1); + V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, V2); + return DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, V0, V1, V2); + } return SDValue(); } } @@ -18168,6 +18188,7 @@ case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND"; case X86ISD::ADDS: return "X86ISD::ADDS"; case X86ISD::SUBS: return "X86ISD::SUBS"; + case X86ISD::USAD: return "X86ISD::USAD"; } return nullptr; } @@ -21992,6 +22013,50 @@ return true; } +static SDValue PerformHADDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + EVT DestVT = N->getValueType(0); + SDValue absd = N->getOperand(0); + + if (absd->getOpcode() != ISD::UABSD) + return SDValue(); + + SDValue arg1 = absd->getOperand(0); + SDValue arg2 = absd->getOperand(1); + SDValue result; + + if ((arg1.getValueType() != MVT::v8i8) && (arg1.getValueType() != MVT::v16i8)) + return SDValue(); + + if (Subtarget->hasSSE1() || Subtarget->hasSSE2() || Subtarget->hasAVX()) { + if (arg1.getValueType() == MVT::v16i8) { + SDValue sad = DAG.getNode(X86ISD::USAD, DL, MVT::v2i64, arg1, arg2); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, + sad, DAG.getConstant(0, DL, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, sad, + DAG.getConstant(1, DL, VecIdxTy)); + BottomHalf = DAG.getNode(ISD::TRUNCATE, DL, DestVT, BottomHalf); + TopHalf = DAG.getNode(ISD::TRUNCATE, DL, DestVT, TopHalf); + result = DAG.getNode(ISD::ADD, DL, DestVT, TopHalf, BottomHalf); + return result; + } else if (arg1.getValueType() == MVT::v8i8) { + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue V0 = DAG.getUNDEF(MVT::v8i8); + SDValue Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V0, arg1); 
+ SDValue Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V0, arg2); + SDValue Ops[] = {Op0, Op1}; + SDValue SAD = DAG.getNode(X86ISD::USAD, DL, MVT::v2i64, Ops); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, + SAD, DAG.getConstant(0, DL, VecIdxTy)); + result = DAG.getNode(ISD::TRUNCATE, DL, DestVT, BottomHalf); + return result; + } + } + return SDValue(); +} + /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -24467,6 +24532,8 @@ case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); case ISD::BITCAST: return PerformBITCASTCombine(N, DAG); + case ISD::UHADD: + return PerformHADDCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); Index: test/CodeGen/X86/absdiff_intrinsic.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/absdiff_intrinsic.ll @@ -0,0 +1,189 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s -check-prefix=CHECK + +declare <4 x i8> @llvm.uabsdiff.v4i8.v4i8.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i8> @test_uabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_uabsdiff_v4i8_expand +; CHECK: vmovdqa .LCPI0_0(%rip), %xmm2 # xmm2 = [255,255,255,255] +; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + + %1 = call <4 x i8> @llvm.uabsdiff.v4i8.v4i8.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i8> %1 +} + +declare <4 x i8> @llvm.sabsdiff.v4i8.v4i8.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i8> @test_sabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_v4i8_expand +; CHECK: vpslld $24, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + + %1 = call <4 x i8> @llvm.sabsdiff.v4i8.v4i8.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i8> %1 +} + +declare <4 x i32> @llvm.sabsdiff.v4i32.v4i8.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i32> @test_sabsdiff_promote_intres(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_promote_intres +; CHECK: vpslld $24, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.sabsdiff.v4i32.v4i8.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i32> %1 +} + +declare <8 x i8> @llvm.sabsdiff.v8i8.v8i8.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_sabsdiff_v8i8_expand(<8 x i8> %a1, <8 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_v8i8_expand +; CHECK: vpsllw $8, 
%xmm1, %xmm1 +; CHECK-NEXT: vpsraw $8, %xmm1, %xmm1 +; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpsraw $8, %xmm0, %xmm0 +; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtw %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i8> @llvm.sabsdiff.v8i8.v8i8.v8i8(<8 x i8> %a1, <8 x i8> %a2) + ret <8 x i8> %1 +} + +declare <16 x i8> @llvm.uabsdiff.v16i8.v16i8.v16i8(<16 x i8>, <16 x i8>) +declare i32 @llvm.uhadd.i32.v16i8(<16 x i8>) + +define i32 @test_v16i8_uhadd_combine(<16 x i8> %a1, <16 x i8> %a2) { +; CHECK-LABEL: test_v16i8_uhadd_combine +; CHECK: psadbw %xmm1, %xmm0 +; CHECK-NEXT: vmovq %xmm0, %rcx +; CHECK-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-NEXT: addl %ecx, %eax +; CHECK: retq + %1 = call <16 x i8> @llvm.uabsdiff.v16i8.v16i8.v16i8(<16 x i8> %a1, <16 x i8> %a2) + %2 = call i32 @llvm.uhadd.i32.v16i8(<16 x i8> %1) + ret i32 %2 +} + +declare <4 x i32> @llvm.sabsdiff.v4i32.v2i32.v2i32(<2 x i32>, <2 x i32>) + +define <4 x i32> @test_sabsdiff_v2i32_expand(<2 x i32> %a1, <2 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v2i32_expand +; CHECK: vpextrq $1, %xmm1, %rax +; CHECK-NEXT: cltq +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: vmovq %xmm1, %rax +; CHECK-NEXT: cltq +; CHECK-NEXT: vmovq %rax, %xmm1 +; CHECK-NEXT: vpunpcklqdq %xmm2, %xmm1, %xmm1 # xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-NEXT: cltq +; CHECK-NEXT: vmovq %rax, %xmm2 +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: cltq +; CHECK-NEXT: vmovq %rax, %xmm0 +; CHECK-NEXT: vpunpcklqdq %xmm2, %xmm0, %xmm0 # xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubq %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vpshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.sabsdiff.v4i32.v2i32.v2i32(<2 x i32> %a1, <2 x i32> %a2) + ret <4 x i32> %1 +} + +declare <16 x i32> @llvm.sabsdiff.v16i32.v16i32.v16i32(<16 x i32>, <16 x i32>) + +define <16 x i32> @test_sabsdiff_v16i32_expand(<16 x i32> %a1, <16 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v16i32_expand +; CHECK: vextractf128 $1, %ymm2, %xmm4 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm5 +; CHECK-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2 +; CHECK-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-NEXT: vpsubd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpsubd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 +; CHECK-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vpcmpgtd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; CHECK-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm4 +; CHECK-NEXT: vpsubd %xmm2, %xmm4, %xmm2 +; CHECK-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3 +; CHECK-NEXT: vpsubd %xmm2, %xmm5, %xmm2 +; CHECK-NEXT: vpsubd %xmm1, %xmm5, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm1, %xmm5, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; CHECK-NEXT: vblendvps %ymm1, %ymm3, %ymm4, %ymm1 +; CHECK-NEXT: retq + %1 = call <16 x i32> @llvm.sabsdiff.v16i32.v16i32.v16i32(<16 x i32> 
%a1, <16 x i32> %a2) + ret <16 x i32> %1 +} + +declare <4 x i32> @llvm.sabsdiff.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_sabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v4i32_expand +; CHECK: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.sabsdiff.v4i32.v4i32.v4i32(<4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %1 +} + +declare <4 x i32> @llvm.uabsdiff.v4i32.v4i32.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: test_uabsdiff_v4i32_expand +; CHECK: vpsubd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpsubd %xmm0, %xmm1, %xmm2 +; CHECK-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.uabsdiff.v4i32.v4i32.v4i32(<4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %1 +} + +;define <4 x i32> @test_sabsdiff_expand(<4 x i32> %a1, <4 x i32> %a2) { +; %sub = sub nsw <4 x i32> %a1, %a2 +; %ispos = icmp sgt <4 x i32> %sub, +; %neg = sub nsw <4 x i32> zeroinitializer, %sub +; %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg +; ret <4 x i32> %1 +;} + Index: test/CodeGen/X86/hadd_intrinsic.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/hadd_intrinsic.ll @@ -0,0 +1,163 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx < %s | FileCheck %s -check-prefix=CHECK + +declare i32 @llvm.uhadd.i32.v4i8(<4 x i8>) + +define i32 @test1_uhadd_intrinsic(<4 x i8> %a1) { +; CHECK-LABEL: test1_uhadd_intrinsic +; CHECK: vpand .LCPI0_0(%rip), %xmm0 +; CHECK-NEXT: vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd $237, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq + %1 = call i32 @llvm.uhadd.i32.v4i8(<4 x i8> %a1) + ret i32 %1 +} + +declare i32 @llvm.shadd.i32.v4i8(<4 x i8>) + +define i32 @test1_shadd_intrinsic(<4 x i8> %a1) { +; CHECK-LABEL: test1_shadd_intrinsic +; CHECK: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd $237, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq + %1 = call i32 @llvm.shadd.i32.v4i8(<4 x i8> %a1) + ret i32 %1 +} + +declare i32 @llvm.uhadd.i32.v4i32(<4 x i32>) + +define i32 @test2_uhadd_intrinsic(<4 x i32> %a1) { +; CHECK-LABEL: test2_uhadd_intrinsic +; CHECK: vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd $237, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq + %1 = call i32 @llvm.uhadd.i32.v4i32(<4 x i32> %a1) + ret i32 %1 +} + +declare i32 @llvm.shadd.i32.v4i32(<4 x i32>) + +define i32 @test2_shadd_intrinsic(<4 x i32> %a1) { +; CHECK-LABEL: test2_shadd_intrinsic +; CHECK: vpshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd $237, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, 
%xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: retq + %1 = call i32 @llvm.shadd.i32.v4i32(<4 x i32> %a1) + ret i32 %1 +} + + +declare i32 @llvm.uhadd.i32.v8i32(<8 x i32>) + +define i32 @test3_uhadd_intrinsic(<8 x i32> %a1) { +; CHECK-LABEL: test3_uhadd_intrinsic +; CHECK: vperm2f128 $1, %ymm0, %ymm0, %ymm1 # ymm1 = ymm0[2,3,0,1] +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vunpckhpd %xmm2, %xmm0, %xmm1 # xmm1 = xmm0[1],xmm2[1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vshufps $237, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = call i32 @llvm.uhadd.i32.v8i32(<8 x i32> %a1) + ret i32 %1 +} + +declare i32 @llvm.shadd.i32.v8i32(<8 x i32>) + +define i32 @test3_shadd_intrinsic(<8 x i32> %a1) { +; CHECK-LABEL: test3_shadd_intrinsic +; CHECK: vperm2f128 $1, %ymm0, %ymm0, %ymm1 # ymm1 = ymm0[2,3,0,1] +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vunpckhpd %xmm2, %xmm0, %xmm1 # xmm1 = xmm0[1],xmm2[1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vshufps $237, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = call i32 @llvm.shadd.i32.v8i32(<8 x i32> %a1) + ret i32 %1 +} + +declare i32 @llvm.uhadd.i32.v16i32(<16 x i32>) + +define i32 @test4_uhadd_intrinsic(<16 x i32> %a1) { +; CHECK-LABEL: test4_uhadd_intrinsic +; CHECK: vperm2f128 $1, %ymm0, %ymm1, %ymm2 # ymm2 = ymm1[2,3,0,1] +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm4 +; CHECK-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vunpckhpd %xmm3, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm3[1] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vshufps $237, %xmm0, %xmm1, %xmm2 # xmm2 = xmm1[1,3],xmm0[2,3] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vmovd %xmm1, %ecx +; CHECK-NEXT: vperm2f128 $1, %ymm0, %ymm0, %ymm1 # ymm1 = ymm0[2,3,0,1] +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vunpckhpd %xmm2, %xmm0, %xmm1 # xmm1 = xmm0[1],xmm2[1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vshufps $237, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = call i32 @llvm.uhadd.i32.v16i32(<16 x i32> %a1) + ret i32 %1 +} + +declare i32 @llvm.shadd.i32.v16i32(<16 x i32>) + +define i32 @test4_shadd_intrinsic(<16 x i32> %a1) { +; CHECK-LABEL: test4_shadd_intrinsic +; CHECK: vperm2f128 $1, %ymm0, %ymm1, %ymm2 # ymm2 = ymm1[2,3,0,1] +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm4 +; CHECK-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vunpckhpd %xmm3, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm3[1] +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vshufps $237, %xmm0, %xmm1, %xmm2 # xmm2 = xmm1[1,3],xmm0[2,3] +; CHECK-NEXT: vpaddd 
%xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vmovd %xmm1, %ecx +; CHECK-NEXT: vperm2f128 $1, %ymm0, %ymm0, %ymm1 # ymm1 = ymm0[2,3,0,1] +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vunpckhpd %xmm2, %xmm0, %xmm1 # xmm1 = xmm0[1],xmm2[1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vshufps $237, %xmm0, %xmm0, %xmm1 # xmm1 = xmm0[1,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %1 = call i32 @llvm.shadd.i32.v16i32(<16 x i32> %a1) + ret i32 %1 +}
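One further case that might be worth exercising (a sketch, not part of the patch): PerformHADDCombine
in X86ISelLowering.cpp only matches the unsigned pair (ISD::UHADD of ISD::UABSD), so a signed pair
such as the following would currently take the generic expansion path rather than producing psadbw;
no CHECK lines are asserted here::

      declare <16 x i8> @llvm.sabsdiff.v16i8.v16i8.v16i8(<16 x i8>, <16 x i8>)
      declare i32 @llvm.shadd.i32.v16i8(<16 x i8>)

      define i32 @test_v16i8_shadd_no_combine(<16 x i8> %a1, <16 x i8> %a2) {
        %1 = call <16 x i8> @llvm.sabsdiff.v16i8.v16i8.v16i8(<16 x i8> %a1, <16 x i8> %a2)
        %2 = call i32 @llvm.shadd.i32.v16i8(<16 x i8> %1)
        ret i32 %2
      }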