Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -9579,6 +9579,65 @@ %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c + +'``llvm.uabsdiff.*``' and '``llvm.sabsdiff.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. The loaded data is a vector of any integer bit width. + +.. code-block:: llvm + + declare <4 x integer> @llvm.uabsdiff.v4i32(<4 x integer> %a, <4 x integer> %b) + + +Overview: +""""""""" + +The ``llvm.uabsdiff`` intrinsic returns a vector result of the absolute difference of the two operands, +treating them both as unsigned integers. + +The ``llvm.sabsdiff`` intrinsic returns a vector result of the absolute difference of the two operands, +treating them both as signed integers. + +.. note:: + + These intrinsics are primarily used during the code generation stage of compilation. + They are generated by compiler passes such as the Loop and SLP vectorizers.it is not + recommended for users to create them manually. + +Arguments: +"""""""""" + +Both intrinsics take two integer of the same bitwidth. + +Semantics: +"""""""""" + +The expression:: + + call <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b) + +is equivalent to:: + + %sub = sub <4 x i32> %a, %b + %ispos = icmp ugt <4 x i32> %sub, + %neg = sub <4 x i32> zeroinitializer, %sub + %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg + +Similarly the expression:: + + call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a, <4 x i32> %b) + +is equivalent to:: + + %sub = sub nsw <4 x i32> %a, %b + %ispos = icmp sgt <4 x i32> %sub, + %neg = sub nsw <4 x i32> zeroinitializer, %sub + %1 = select <4 x i1> %ispos, <4 x i32> %sub, <4 x i32> %neg + + Half Precision Floating Point Intrinsics ---------------------------------------- Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -334,6 +334,10 @@ /// Byte Swap and Counting operators. BSWAP, CTTZ, CTLZ, CTPOP, + /// [SU]ABSDIFF - Signed/Unsigned absolute difference of two input integer + /// vector. These nodes are generated from llvm.*absdiff* intrinsics. + SABSDIFF, UABSDIFF, + /// Bit counting operators with an undefined result for zero inputs. CTTZ_ZERO_UNDEF, CTLZ_ZERO_UNDEF, Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -603,6 +603,12 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; +// Calculate the Absolute Differences of the two input vectors. +def int_sabsdiff : Intrinsic<[llvm_anyvector_ty], + [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>; +def int_uabsdiff : Intrinsic<[llvm_anyvector_ty], + [ LLVMMatchType<0>, LLVMMatchType<0> ], [IntrNoMem]>; + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>, Index: include/llvm/Target/TargetSelectionDAG.td =================================================================== --- include/llvm/Target/TargetSelectionDAG.td +++ include/llvm/Target/TargetSelectionDAG.td @@ -386,6 +386,8 @@ def umin : SDNode<"ISD::UMIN" , SDTIntBinOp>; def umax : SDNode<"ISD::UMAX" , SDTIntBinOp>; +def sabsdiff : SDNode<"ISD::SABSDIFF" , SDTIntBinOp>; +def uabsdiff : SDNode<"ISD::UABSDIFF" , SDTIntBinOp>; def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>; def bswap : SDNode<"ISD::BSWAP" , SDTIntUnaryOp>; def ctlz : SDNode<"ISD::CTLZ" , SDTIntUnaryOp>; Index: lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -146,6 +146,10 @@ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: Res = PromoteIntRes_AtomicCmpSwap(cast(N), ResNo); break; + case ISD::UABSDIFF: + case ISD::SABSDIFF: + Res = PromoteIntRes_SimpleIntBinOp(N); + break; } // If the result is null then the sub-method took care of registering it. Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -105,6 +105,7 @@ SDValue ExpandLoad(SDValue Op); SDValue ExpandStore(SDValue Op); SDValue ExpandFNEG(SDValue Op); + SDValue ExpandABSDIFF(SDValue Op); /// \brief Implements vector promotion. /// @@ -326,6 +327,8 @@ case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: + case ISD::UABSDIFF: + case ISD::SABSDIFF: QueryType = Node->getValueType(0); break; case ISD::FP_ROUND_INREG: @@ -704,11 +707,45 @@ return ExpandFNEG(Op); case ISD::SETCC: return UnrollVSETCC(Op); + case ISD::UABSDIFF: + case ISD::SABSDIFF: + return ExpandABSDIFF(Op); default: return DAG.UnrollVectorOp(Op.getNode()); } } +SDValue VectorLegalizer::ExpandABSDIFF(SDValue Op) { + SDLoc dl(Op); + SDValue Tmp1, Tmp2, Tmp3, Tmp4; + EVT VT = Op.getValueType(); + if (Op->getOpcode() == ISD::UABSDIFF) { + Tmp2 = Op.getOperand(0); + Tmp3 = Op.getOperand(1); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp3); + Tmp2 = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Tmp1); + Tmp4 = DAG.getNode( + ISD::SETCC, dl, TLI.getSetCCResultType(*DAG.getContext(), VT), Tmp2, + DAG.getConstant(0, dl, VT), DAG.getCondCode(ISD::SETULT)); + Tmp1 = DAG.getNode(ISD::VSELECT, dl, VT, Tmp4, Tmp1, Tmp2); + return Tmp1; + } else { + SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + + Tmp2 = Op.getOperand(0); + Tmp3 = Op.getOperand(1); + Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp3, &Flags); + Tmp2 = + DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Tmp1, &Flags); + Tmp4 = DAG.getNode(ISD::SETCC, dl, + TLI.getSetCCResultType(*DAG.getContext(), VT), Tmp2, + DAG.getConstant(0, dl, VT), DAG.getCondCode(ISD::SETLT)); + Tmp1 = DAG.getNode(ISD::VSELECT, dl, VT, Tmp4, Tmp1, Tmp2); + return Tmp1; + } +} + SDValue VectorLegalizer::ExpandSELECT(SDValue Op) { // Lower a select instruction where the condition is a scalar and the // operands are vectors. Lower this select to VSELECT and implement it Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -675,6 +675,8 @@ case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: + case ISD::UABSDIFF: + case ISD::SABSDIFF: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4615,6 +4615,18 @@ getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; + case Intrinsic::uabsdiff: + setValue(&I, DAG.getNode(ISD::UABSDIFF, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; + case Intrinsic::sabsdiff: + setValue(&I, DAG.getNode(ISD::SABSDIFF, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)))); + return nullptr; case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -225,6 +225,8 @@ case ISD::SHL_PARTS: return "shl_parts"; case ISD::SRA_PARTS: return "sra_parts"; case ISD::SRL_PARTS: return "srl_parts"; + case ISD::UABSDIFF: return "uabsdiff"; + case ISD::SABSDIFF: return "sabsdiff"; // Conversion operators. case ISD::SIGN_EXTEND: return "sign_extend"; Index: lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- lib/CodeGen/TargetLoweringBase.cpp +++ lib/CodeGen/TargetLoweringBase.cpp @@ -827,6 +827,8 @@ setOperationAction(ISD::USUBO, VT, Expand); setOperationAction(ISD::SMULO, VT, Expand); setOperationAction(ISD::UMULO, VT, Expand); + setOperationAction(ISD::UABSDIFF, VT, Expand); + setOperationAction(ISD::SABSDIFF, VT, Expand); // These library functions default to expand. setOperationAction(ISD::FROUND, VT, Expand); Index: test/CodeGen/X86/absdiff_expand.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/absdiff_expand.ll @@ -0,0 +1,242 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=CHECK + +declare <4 x i8> @llvm.uabsdiff.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i8> @test_uabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_uabsdiff_v4i8_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + + %1 = call <4 x i8> @llvm.uabsdiff.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i8> %1 +} + +declare <4 x i8> @llvm.sabsdiff.v4i8(<4 x i8>, <4 x i8>) + +define <4 x i8> @test_sabsdiff_v4i8_expand(<4 x i8> %a1, <4 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_v4i8_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubd %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + + %1 = call <4 x i8> @llvm.sabsdiff.v4i8(<4 x i8> %a1, <4 x i8> %a2) + ret <4 x i8> %1 +} + + +declare <8 x i8> @llvm.sabsdiff.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_sabsdiff_v8i8_expand(<8 x i8> %a1, <8 x i8> %a2) { +; CHECK-LABEL: test_sabsdiff_v8i8_expand +; CHECK: psubw %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubw %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtw %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i8> @llvm.sabsdiff.v8i8(<8 x i8> %a1, <8 x i8> %a2) + ret <8 x i8> %1 +} + +declare <16 x i8> @llvm.uabsdiff.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uabsdiff_v16i8_expand(<16 x i8> %a1, <16 x i8> %a2) { +; CHECK-LABEL: test_uabsdiff_v16i8_expand +; CHECK: psubb %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubb %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtb %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <16 x i8> @llvm.uabsdiff.v16i8(<16 x i8> %a1, <16 x i8> %a2) + ret <16 x i8> %1 +} + +declare <8 x i16> @llvm.uabsdiff.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uabsdiff_v8i16_expand(<8 x i16> %a1, <8 x i16> %a2) { +; CHECK-LABEL: test_uabsdiff_v8i16_expand +; CHECK: psubw %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubw %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtw %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i16> @llvm.uabsdiff.v8i16(<8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %1 +} + +declare <8 x i16> @llvm.sabsdiff.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_sabsdiff_v8i16_expand(<8 x i16> %a1, <8 x i16> %a2) { +; CHECK-LABEL: test_sabsdiff_v8i16_expand +; CHECK: psubw %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubw %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtw %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <8 x i16> @llvm.sabsdiff.v8i16(<8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %1 +} + +declare <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_sabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v4i32_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: psubd %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm1 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.sabsdiff.v4i32(<4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %1 +} + +declare <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uabsdiff_v4i32_expand(<4 x i32> %a1, <4 x i32> %a2) { +; CHECK-LABEL: test_uabsdiff_v4i32_expand +; CHECK: psubd %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubd %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm2 +; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: retq + %1 = call <4 x i32> @llvm.uabsdiff.v4i32(<4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %1 +} + +declare <2 x i32> @llvm.sabsdiff.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_sabsdiff_v2i32_expand(<2 x i32> %a1, <2 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v2i32_expand +; CHECK: psubq %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubq %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd $160, %xmm4, %xmm5 # xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 +; CHECK-NEXT: pshufd $245, %xmm3, %xmm2 # xmm2 = xmm3[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd $245, %xmm4, %xmm3 # xmm3 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm3 +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: retq + %1 = call <2 x i32> @llvm.sabsdiff.v2i32(<2 x i32> %a1, <2 x i32> %a2) + ret <2 x i32> %1 +} + +declare <2 x i64> @llvm.sabsdiff.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_sabsdiff_v2i64_expand(<2 x i64> %a1, <2 x i64> %a2) { +; CHECK-LABEL: test_sabsdiff_v2i64_expand +; CHECK: psubq %xmm1, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: psubq %xmm0, %xmm1 +; CHECK-NEXT: movdqa .LCPI{{[0-9_]*}} +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm2, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm4 +; CHECK-NEXT: pshufd $160, %xmm4, %xmm5 # xmm5 = xmm4[0,0,2,2] +; CHECK-NEXT: pcmpeqd %xmm2, %xmm3 +; CHECK-NEXT: pshufd $245, %xmm3, %xmm2 # xmm2 = xmm3[1,1,3,3] +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pshufd $245, %xmm4, %xmm3 # xmm3 = xmm4[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm1, %xmm3 +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: retq + %1 = call <2 x i64> @llvm.sabsdiff.v2i64(<2 x i64> %a1, <2 x i64> %a2) + ret <2 x i64> %1 +} + +declare <16 x i32> @llvm.sabsdiff.v16i32(<16 x i32>, <16 x i32>) + +define <16 x i32> @test_sabsdiff_v16i32_expand(<16 x i32> %a1, <16 x i32> %a2) { +; CHECK-LABEL: test_sabsdiff_v16i32_expand +; CHECK: psubd %xmm4, %xmm0 +; CHECK-NEXT: pxor %xmm8, %xmm8 +; CHECK-NEXT: pxor %xmm9, %xmm9 +; CHECK-NEXT: psubd %xmm0, %xmm9 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm9, %xmm4 +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: pandn %xmm9, %xmm4 +; CHECK-NEXT: por %xmm4, %xmm0 +; CHECK-NEXT: psubd %xmm5, %xmm1 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: psubd %xmm1, %xmm4 +; CHECK-NEXT: pxor %xmm5, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm1 +; CHECK-NEXT: pandn %xmm4, %xmm5 +; CHECK-NEXT: por %xmm5, %xmm1 +; CHECK-NEXT: psubd %xmm6, %xmm2 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: psubd %xmm2, %xmm4 +; CHECK-NEXT: pxor %xmm5, %xmm5 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm5 +; CHECK-NEXT: pand %xmm5, %xmm2 +; CHECK-NEXT: pandn %xmm4, %xmm5 +; CHECK-NEXT: por %xmm5, %xmm2 +; CHECK-NEXT: psubd %xmm7, %xmm3 +; CHECK-NEXT: pxor %xmm4, %xmm4 +; CHECK-NEXT: psubd %xmm3, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm4, %xmm8 +; CHECK-NEXT: pand %xmm8, %xmm3 +; CHECK-NEXT: pandn %xmm4, %xmm8 +; CHECK-NEXT: por %xmm8, %xmm3 +; CHECK-NEXT: retq + %1 = call <16 x i32> @llvm.sabsdiff.v16i32(<16 x i32> %a1, <16 x i32> %a2) + ret <16 x i32> %1 +} +