Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -2650,6 +2650,32 @@
       const SmallVectorImpl<CCValAssign> &ArgLocs,
       const SmallVectorImpl<SDValue> &OutVals) const;
 
+  /// Return true if \p Extract is the root of a basic SAD (sum of absolute
+  /// differences) pattern: an add + shuffle reduction pyramid, optionally
+  /// followed by an any/zero/sign extend, whose input is the abs-diff select
+  /// recognized by detectExtAbsDiff. On success \p Zext0 and \p Zext1 are set
+  /// to the two extension nodes feeding the subtraction.
+  static bool isBasicSADPattern(SelectionDAG &DAG, SDNode *Extract,
+                                SDValue &Zext0, SDValue &Zext1,
+                                ArrayRef<EVT> CandidateDataTypes,
+                                ArrayRef<ISD::NodeType> CandidateExtOps);
+
+  /// Return true if \p Select computes the absolute difference of two
+  /// vectors that are both extended by the same opcode from
+  /// \p CandidateExtOps and whose source element type is one of
+  /// \p CandidateDataTypes. \p Op0 and \p Op1 receive the operands of the
+  /// subtraction; they are only meaningful when true is returned.
+  static bool detectExtAbsDiff(const SDValue &Select, SDValue &Op0,
+                               SDValue &Op1, ArrayRef<EVT> CandidateDataTypes,
+                               ArrayRef<ISD::NodeType> CandidateExtOps);
+
+  /// Match a binop + shuffle pyramid that represents a horizontal reduction
+  /// over the elements of a vector, ending in an extract of element 0.
+  /// Returns the vector being reduced and sets \p BinOp to the matched
+  /// opcode, or returns SDValue() if no reduction was matched.
+  static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
+                                     ArrayRef<ISD::NodeType> CandidateBinOps);
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
   //
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -4291,3 +4291,187 @@
   }
   return SDValue();
 }
+
+// Match a binop + shuffle pyramid that represents a horizontal reduction over
+// the elements of a vector.
+// Returns the vector that is being reduced on, or SDValue() if a reduction
+// was not matched.
+SDValue
+TargetLowering::matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
+                                    ArrayRef<ISD::NodeType> CandidateBinOps) {
+  // The pattern must end in an extract from index 0.
+  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
+      !isNullConstant(Extract->getOperand(1)))
+    return SDValue();
+
+  SDValue Op = Extract->getOperand(0);
+  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
+
+  // Match against one of the candidate binary ops.
+  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
+        return Op.getOpcode() == unsigned(BinOp);
+      }))
+    return SDValue();
+
+  // At each stage, we're looking for something that looks like:
+  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
+  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
+  //                               i32 undef, i32 undef, i32 undef, i32 undef>
+  // %a = binop <8 x i32> %op, %s
+  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
+  // we expect something like:
+  // <4,5,6,7,u,u,u,u>
+  // <2,3,u,u,u,u,u,u>
+  // <1,u,u,u,u,u,u,u>
+  unsigned CandidateBinOp = Op.getOpcode();
+  for (unsigned i = 0; i < Stages; ++i) {
+    if (Op.getOpcode() != CandidateBinOp)
+      return SDValue();
+
+    ShuffleVectorSDNode *Shuffle =
+        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
+    if (Shuffle) {
+      Op = Op.getOperand(1);
+    } else {
+      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
+      Op = Op.getOperand(0);
+    }
+
+    // The first operand of the shuffle should be the same as the other operand
+    // of the binop.
+    if (!Shuffle || Shuffle->getOperand(0) != Op)
+      return SDValue();
+
+    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
+      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
+        return SDValue();
+  }
+
+  BinOp = CandidateBinOp;
+  return Op;
+}
+
+// Given a select, detect the following pattern:
+// 1:    %2 = ext <N x iM> %0 to <N x i32>
+// 2:    %3 = ext <N x iM> %1 to <N x i32>
+// 3:    %4 = sub nsw <N x i32> %2, %3
+// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
+// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
+// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
+// where "ext" is one of CandidateExtOps and iM is one of CandidateDataTypes.
+// The mirrored form using icmp slt (against 0 or 1) with the select operands
+// swapped is accepted as well.
+// This is useful as it is the input into a SAD pattern.
+bool
+TargetLowering::detectExtAbsDiff(const SDValue &Select, SDValue &Op0,
+                                 SDValue &Op1,
+                                 ArrayRef<EVT> CandidateDataTypes,
+                                 ArrayRef<ISD::NodeType> CandidateExtOps) {
+  // The select condition must be a signed greater-than or less-than setcc.
+  SDValue SetCC = Select->getOperand(0);
+  if (SetCC.getOpcode() != ISD::SETCC)
+    return false;
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
+  if (CC != ISD::SETGT && CC != ISD::SETLT)
+    return false;
+
+  SDValue SelectOp1 = Select->getOperand(1);
+  SDValue SelectOp2 = Select->getOperand(2);
+
+  // The following instructions assume SelectOp1 is the subtraction operand
+  // and SelectOp2 is the negation operand.
+  // In the case of SETLT this is the other way around.
+  if (CC == ISD::SETLT)
+    std::swap(SelectOp1, SelectOp2);
+
+  // The second operand of the select should be the negation of the first
+  // operand, which is implemented as 0 - SelectOp1.
+  if (!(SelectOp2.getOpcode() == ISD::SUB &&
+        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
+        SelectOp2.getOperand(1) == SelectOp1))
+    return false;
+
+  // The first operand of SetCC is the first operand of the select, which is the
+  // difference between the two input vectors.
+  if (SetCC.getOperand(0) != SelectOp1)
+    return false;
+
+  // In the SETLT case, the second operand of the comparison can be 1 or 0.
+  APInt SplatVal;
+  if ((CC == ISD::SETLT) &&
+      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+         SplatVal.isOneValue()) ||
+        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+    return false;
+
+  // In the SETGT case, the second operand of the comparison can be -1 or 0.
+  if ((CC == ISD::SETGT) &&
+      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
+    return false;
+
+  // The first operand of the select is the difference between the two input
+  // vectors.
+  if (SelectOp1.getOpcode() != ISD::SUB)
+    return false;
+
+  Op0 = SelectOp1.getOperand(0);
+  Op1 = SelectOp1.getOperand(1);
+
+  // Both operands of the sub must come from the same kind of extension, and
+  // that extension must be one of the candidates. Test the opcodes before
+  // looking at operand 0 so that nodes without operands are rejected safely.
+  if (Op0.getOpcode() != Op1.getOpcode() ||
+      llvm::none_of(CandidateExtOps, [Op0](ISD::NodeType ExtOp) {
+        return Op0.getOpcode() == unsigned(ExtOp);
+      }))
+    return false;
+
+  // The two extended values must have the same source type ...
+  EVT SrcVT = Op0.getOperand(0).getValueType();
+  if (SrcVT != Op1.getOperand(0).getValueType())
+    return false;
+
+  // ... and its element type must be one of the candidate data types.
+  return llvm::any_of(CandidateDataTypes, [SrcVT](EVT DT) {
+    return SrcVT.getVectorElementType() == DT;
+  });
+}
+
+// Return true if Extract is the root of a basic SAD pattern, i.e. an
+// add + shuffle reduction pyramid over an abs-diff select (see
+// detectExtAbsDiff). On success Zext0 and Zext1 are the two extension nodes
+// feeding the subtraction. DAG is currently unused.
+bool
+TargetLowering::isBasicSADPattern(SelectionDAG &DAG, SDNode *Extract,
+                                  SDValue &Zext0, SDValue &Zext1,
+                                  ArrayRef<EVT> CandidateDataTypes,
+                                  ArrayRef<ISD::NodeType> CandidateExtOps) {
+  // Match shuffle + add pyramid.
+  unsigned BinOp = 0;
+  SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
+
+  // The operand is expected to be extended by one of the extension opcodes
+  // in CandidateExtOps from a data type in CandidateDataTypes
+  // (verified in detectExtAbsDiff).
+  // In order to convert to i64 and above, additional any/zero/sign
+  // extend is expected.
+  // The zero extend from 32 bit has no mathematical effect on the result.
+  // Also the sign extend is basically zero extend
+  // (extends the sign bit which is zero).
+  // So it is correct to skip the sign/zero extend instruction.
+  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+               Root.getOpcode() == ISD::ZERO_EXTEND ||
+               Root.getOpcode() == ISD::ANY_EXTEND))
+    Root = Root.getOperand(0);
+
+  // If there was a match, we want Root to be a select that is the root of an
+  // abs-diff pattern.
+  if (!Root || (Root.getOpcode() != ISD::VSELECT))
+    return false;
+
+  // Check whether we have an abs-diff pattern feeding into the select.
+  return detectExtAbsDiff(Root, Zext0, Zext1, CandidateDataTypes,
+                          CandidateExtOps);
+}
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1023,6 +1023,8 @@
     setTargetDAGCombine(ISD::FSQRT);
   }
 
+  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+
   // Darwin long double math library functions have $LDBL128 appended.
   if (Subtarget.isDarwin()) {
     setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
@@ -12090,6 +12092,116 @@
   return SDValue();
 }
 
+// Replace a basic SAD reduction rooted at Extract (see
+// TargetLowering::isBasicSADPattern) with Altivec min/max/subtract and
+// sum-across instructions. Handles sign- or zero-extended v16i8 and v8i16
+// inputs; returns SDValue() if the pattern does not match.
+static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget) {
+  // Currently, we support the SAD pattern only on ppc64le with Altivec.
+  if (!(Subtarget.hasAltivec() && Subtarget.isPPC64() &&
+        Subtarget.isLittleEndian()))
+    return SDValue();
+
+  // Verify the type we're extracting from is any integer type above i16.
+  EVT VT = Extract->getOperand(0).getValueType();
+  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
+    return SDValue();
+
+  // We handle reductions over 16 or 8 elements (v16i8 / v8i16 sources).
+  // TODO: We should be able to handle larger vectors by splitting them before
+  // feeding them into several SADs, and then reducing over those.
+  if (VT.getVectorNumElements() != 16 && VT.getVectorNumElements() != 8)
+    return SDValue();
+
+  SDValue Zext0, Zext1;
+  if (!TargetLowering::isBasicSADPattern(DAG, Extract, Zext0, Zext1,
+                                         {MVT::i8, MVT::i16},
+                                         {ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}))
+    return SDValue();
+
+  EVT SrcVT = Zext0.getOperand(0).getValueType();
+  bool IsSigned = (Zext0.getOpcode() == ISD::SIGN_EXTEND);
+
+  SDLoc DL(Extract);
+  SDValue VZero = SDValue(DAG.getMachineNode(PPC::V_SET0, DL, MVT::v4i32), 0);
+
+  // Extract the requested element of the v4i32 sum and convert it to the type
+  // of the original extract, which may be wider than i32. The sum of absolute
+  // differences cannot be negative, so a zero extension preserves the value.
+  auto ExtractSum = [&](SDNode *SumNode) {
+    SDValue Sum = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                              SDValue(SumNode, 0), Extract->getOperand(1));
+    return DAG.getZExtOrTrunc(Sum, DL, Extract->getValueType(0));
+  };
+
+  SDNode *MaxNode, *MinNode;
+  if (SrcVT == MVT::v16i8) {
+    if (IsSigned) {
+      MaxNode = DAG.getMachineNode(PPC::VMAXSB, DL, MVT::v16i8,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+      MinNode = DAG.getMachineNode(PPC::VMINSB, DL, MVT::v16i8,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+    } else {
+      MaxNode = DAG.getMachineNode(PPC::VMAXUB, DL, MVT::v16i8,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+      MinNode = DAG.getMachineNode(PPC::VMINUB, DL, MVT::v16i8,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+    }
+    SDNode *AbsNode = DAG.getMachineNode(PPC::VSUBUBM, DL, MVT::v16i8,
+                                         SDValue(MaxNode, 0),
+                                         SDValue(MinNode, 0));
+    SDNode *Sum1Node = DAG.getMachineNode(PPC::VSUM4UBS, DL, MVT::v4i32,
+                                          SDValue(AbsNode, 0), VZero);
+    SDNode *Sum2Node = DAG.getMachineNode(PPC::VSUMSWS, DL, MVT::v4i32,
+                                          SDValue(Sum1Node, 0), VZero);
+    return ExtractSum(Sum2Node);
+  }
+  if (SrcVT == MVT::v8i16) {
+    if (IsSigned) {
+      MaxNode = DAG.getMachineNode(PPC::VMAXSH, DL, MVT::v8i16,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+      MinNode = DAG.getMachineNode(PPC::VMINSH, DL, MVT::v8i16,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+    } else {
+      MaxNode = DAG.getMachineNode(PPC::VMAXUH, DL, MVT::v8i16,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+      MinNode = DAG.getMachineNode(PPC::VMINUH, DL, MVT::v8i16,
+                                   Zext0.getOperand(0), Zext1.getOperand(0));
+    }
+    SDNode *AbsNode = DAG.getMachineNode(PPC::VSUBUHM, DL, MVT::v8i16,
+                                         SDValue(MaxNode, 0),
+                                         SDValue(MinNode, 0));
+
+    // We cannot use VSUM4SHS since the absolute value in AbsNode is unsigned.
+    // So we create two zero-extended v4i32 vectors from input v8i16 vector
+    // and execute two VSUMSWS instructions.
+    SmallVector<SDValue, 16> Mask1, Mask2;
+    for (unsigned i = 0; i < 16; i++)
+      if (i & 2) {
+        Mask1.push_back(DAG.getConstant(0, DL, MVT::i32));
+        Mask2.push_back(DAG.getConstant(0, DL, MVT::i32));
+      } else {
+        Mask1.push_back(DAG.getConstant(29 - i, DL, MVT::i32));
+        Mask2.push_back(DAG.getConstant(0xFF, DL, MVT::i32));
+      }
+    SDValue VMask1 = DAG.getBuildVector(MVT::v16i8, DL, Mask1);
+    SDValue VMask2 = DAG.getBuildVector(MVT::v16i8, DL, Mask2);
+    SDNode *AbsOddNode = DAG.getMachineNode(PPC::VPERM, DL, MVT::v8i16,
+                                            VZero, SDValue(AbsNode, 0),
+                                            VMask1);
+    SDNode *AbsEvenNode = DAG.getMachineNode(PPC::VAND, DL, MVT::v8i16,
+                                             SDValue(AbsNode, 0), VMask2);
+    SDNode *Sum1Node = DAG.getMachineNode(PPC::VSUMSWS, DL, MVT::v4i32,
+                                          SDValue(AbsEvenNode, 0), VZero);
+    SDNode *Sum2Node = DAG.getMachineNode(PPC::VSUMSWS, DL, MVT::v4i32,
+                                          SDValue(AbsOddNode, 0),
+                                          SDValue(Sum1Node, 0));
+    return ExtractSum(Sum2Node);
+  }
+  return SDValue();
+}
+
 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
 // builtins) into loads with swaps.
 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
@@ -12928,8 +13040,11 @@
   }
   case ISD::BUILD_VECTOR:
     return DAGCombineBuildVector(N, DCI);
+
+  case ISD::EXTRACT_VECTOR_ELT:
+    return combineBasicSADPattern(N, DAG, Subtarget);
   }
 
   return SDValue();
 }
 
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -31184,136 +31184,6 @@
   return SDValue();
 }
 
-// Match a binop + shuffle pyramid that represents a horizontal reduction over
-// the elements of a vector.
-// Returns the vector that is being reduced on, or SDValue() if a reduction
-// was not matched.
-static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
-                                   ArrayRef<ISD::NodeType> CandidateBinOps) {
-  // The pattern must end in an extract from index 0.
-  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
-      !isNullConstant(Extract->getOperand(1)))
-    return SDValue();
-
-  SDValue Op = Extract->getOperand(0);
-  unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
-
-  // Match against one of the candidate binary ops.
-  if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
-        return Op.getOpcode() == unsigned(BinOp);
-      }))
-    return SDValue();
-
-  // At each stage, we're looking for something that looks like:
-  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
-  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
-  //                               i32 undef, i32 undef, i32 undef, i32 undef>
-  // %a = binop <8 x i32> %op, %s
-  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
-  // we expect something like:
-  // <4,5,6,7,u,u,u,u>
-  // <2,3,u,u,u,u,u,u>
-  // <1,u,u,u,u,u,u,u>
-  unsigned CandidateBinOp = Op.getOpcode();
-  for (unsigned i = 0; i < Stages; ++i) {
-    if (Op.getOpcode() != CandidateBinOp)
-      return SDValue();
-
-    ShuffleVectorSDNode *Shuffle =
-        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
-    if (Shuffle) {
-      Op = Op.getOperand(1);
-    } else {
-      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
-      Op = Op.getOperand(0);
-    }
-
-    // The first operand of the shuffle should be the same as the other operand
-    // of the binop.
-    if (!Shuffle || Shuffle->getOperand(0) != Op)
-      return SDValue();
-
-    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
-    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
-      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
-        return SDValue();
-  }
-
-  BinOp = CandidateBinOp;
-  return Op;
-}
-
-// Given a select, detect the following pattern:
-// 1:    %2 = zext <N x i8> %0 to <N x i32>
-// 2:    %3 = zext <N x i8> %1 to <N x i32>
-// 3:    %4 = sub nsw <N x i32> %2, %3
-// 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
-// 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
-// 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
-// This is useful as it is the input into a SAD pattern.
-static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
-                              SDValue &Op1) {
-  // Check the condition of the select instruction is greater-than.
-  SDValue SetCC = Select->getOperand(0);
-  if (SetCC.getOpcode() != ISD::SETCC)
-    return false;
-  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
-  if (CC != ISD::SETGT && CC != ISD::SETLT)
-    return false;
-
-  SDValue SelectOp1 = Select->getOperand(1);
-  SDValue SelectOp2 = Select->getOperand(2);
-
-  // The following instructions assume SelectOp1 is the subtraction operand
-  // and SelectOp2 is the negation operand.
-  // In the case of SETLT this is the other way around.
-  if (CC == ISD::SETLT)
-    std::swap(SelectOp1, SelectOp2);
-
-  // The second operand of the select should be the negation of the first
-  // operand, which is implemented as 0 - SelectOp1.
-  if (!(SelectOp2.getOpcode() == ISD::SUB &&
-        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
-        SelectOp2.getOperand(1) == SelectOp1))
-    return false;
-
-  // The first operand of SetCC is the first operand of the select, which is the
-  // difference between the two input vectors.
-  if (SetCC.getOperand(0) != SelectOp1)
-    return false;
-
-  // In SetLT case, The second operand of the comparison can be either 1 or 0.
-  APInt SplatVal;
-  if ((CC == ISD::SETLT) &&
-      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
-         SplatVal.isOneValue()) ||
-        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
-    return false;
-
-  // In SetGT case, The second operand of the comparison can be either -1 or 0.
-  if ((CC == ISD::SETGT) &&
-      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
-        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
-    return false;
-
-  // The first operand of the select is the difference between the two input
-  // vectors.
-  if (SelectOp1.getOpcode() != ISD::SUB)
-    return false;
-
-  Op0 = SelectOp1.getOperand(0);
-  Op1 = SelectOp1.getOperand(1);
-
-  // Check if the operands of the sub are zero-extended from vectors of i8.
-  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
-      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
-      Op1.getOpcode() != ISD::ZERO_EXTEND ||
-      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
-    return false;
-
-  return true;
-}
-
 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
 // to these zexts.
 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
@@ -31358,7 +31228,7 @@
 
   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
   unsigned BinOp;
-  SDValue Src = matchBinOpReduction(
+  SDValue Src = TargetLowering::matchBinOpReduction(
       Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
   if (!Src)
     return SDValue();
@@ -31438,7 +31308,8 @@
 
   // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
   unsigned BinOp = 0;
-  SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+  SDValue Match = TargetLowering::matchBinOpReduction(Extract, BinOp,
+                                                      {ISD::OR, ISD::AND});
   if (!Match)
     return SDValue();
 
@@ -31496,6 +31367,14 @@
   return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
 }
 
+// Detect an abs-diff pattern (see TargetLowering::detectExtAbsDiff) whose
+// inputs are zero-extended from vectors of i8.
+static bool detectZextAbsDiff(const SDValue &SelectOp, SDValue &Op0,
+                              SDValue &Op1) {
+  return TargetLowering::detectExtAbsDiff(SelectOp, Op0, Op1, {MVT::i8},
+                                          {ISD::ZERO_EXTEND});
+}
+
 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   // PSADBW is only supported on SSE2 and up.
@@ -31519,31 +31398,9 @@
   if (RegSize / VT.getVectorNumElements() < 8)
     return SDValue();
 
-  // Match shuffle + add pyramid.
-  unsigned BinOp = 0;
-  SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
-
-  // The operand is expected to be zero extended from i8
-  // (verified in detectZextAbsDiff).
-  // In order to convert to i64 and above, additional any/zero/sign
-  // extend is expected.
-  // The zero extend from 32 bit has no mathematical effect on the result.
-  // Also the sign extend is basically zero extend
-  // (extends the sign bit which is zero).
-  // So it is correct to skip the sign/zero extend instruction.
-  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
-               Root.getOpcode() == ISD::ZERO_EXTEND ||
-               Root.getOpcode() == ISD::ANY_EXTEND))
-    Root = Root.getOperand(0);
-
-  // If there was a match, we want Root to be a select that is the root of an
-  // abs-diff pattern.
-  if (!Root || (Root.getOpcode() != ISD::VSELECT))
-    return SDValue();
-
-  // Check whether we have an abs-diff pattern feeding into the select.
   SDValue Zext0, Zext1;
-  if (!detectZextAbsDiff(Root, Zext0, Zext1))
+  if (!TargetLowering::isBasicSADPattern(DAG, Extract, Zext0, Zext1, {MVT::i8},
+                                         {ISD::ZERO_EXTEND}))
     return SDValue();
 
   // Create the SAD instruction.
Index: test/CodeGen/PowerPC/ppc64_basicSAD.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/ppc64_basicSAD.ll
@@ -0,0 +1,138 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+
+; Check that a basic SAD (sum of absolute differences) reduction is lowered to
+; vector min/max/subtract followed by sum-across instructions.
+
+; SAD of 16 sign-extended i8 elements.
+define zeroext i32 @func8s(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) {
+; CHECK-LABEL: @func8s
+; CHECK-DAG: vminsb [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxsb [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsububm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK: vsum4ubs [[SUM1:[0-9]+]], [[ABS]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[SUM1]], [[ZERO]]
+; CHECK: mfvsrwz {{[0-9]+}}
+entry:
+  %0 = bitcast i8* %pix1 to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = sext <16 x i8> %1 to <16 x i32>
+  %3 = bitcast i8* %pix2 to <16 x i8>*
+  %4 = load <16 x i8>, <16 x i8>* %3, align 1
+  %5 = sext <16 x i8> %4 to <16 x i32>
+  %6 = sub nsw <16 x i32> %2, %5
+  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <16 x i32> zeroinitializer, %6
+  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
+  %rdx.shuf = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add nsw <16 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add nsw <16 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add nsw <16 x i32> %bin.rdx13, %rdx.shuf14
+  %rdx.shuf16 = shufflevector <16 x i32> %bin.rdx15, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx17 = add nsw <16 x i32> %bin.rdx15, %rdx.shuf16
+  %10 = extractelement <16 x i32> %bin.rdx17, i32 0
+  ret i32 %10
+}
+
+; SAD of 16 zero-extended i8 elements.
+define zeroext i32 @func8u(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) {
+; CHECK-LABEL: @func8u
+; CHECK-DAG: vminub [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxub [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsububm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK: vsum4ubs [[SUM1:[0-9]+]], [[ABS]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[SUM1]], [[ZERO]]
+; CHECK: mfvsrwz {{[0-9]+}}
+entry:
+  %0 = bitcast i8* %pix1 to <16 x i8>*
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = zext <16 x i8> %1 to <16 x i32>
+  %3 = bitcast i8* %pix2 to <16 x i8>*
+  %4 = load <16 x i8>, <16 x i8>* %3, align 1
+  %5 = zext <16 x i8> %4 to <16 x i32>
+  %6 = sub nsw <16 x i32> %2, %5
+  %7 = icmp sgt <16 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <16 x i32> zeroinitializer, %6
+  %9 = select <16 x i1> %7, <16 x i32> %6, <16 x i32> %8
+  %rdx.shuf = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <16 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add <16 x i32> %bin.rdx13, %rdx.shuf14
+  %rdx.shuf16 = shufflevector <16 x i32> %bin.rdx15, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx17 = add <16 x i32> %bin.rdx15, %rdx.shuf16
+  %10 = extractelement <16 x i32> %bin.rdx17, i32 0
+  ret i32 %10
+}
+
+; SAD of 8 sign-extended i16 elements.
+define signext i32 @func16s(i16* nocapture readonly %pix1, i16* nocapture readonly %pix2) {
+; CHECK-LABEL: @func16s
+; CHECK-DAG: vminsh [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxsh [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsubuhm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK-DAG: vand [[EVEN:[0-9]+]], [[ABS]], {{[0-9]+}}
+; CHECK-DAG: vperm [[ODD:[0-9]+]], [[ZERO]], [[ABS]], {{[0-9]+}}
+; CHECK: vsumsws [[SUM1:[0-9]+]], [[EVEN]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[ODD]], [[SUM1]]
+; CHECK: mfvsrwz {{[0-9]+}}
+
+entry:
+  %0 = bitcast i16* %pix1 to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = sext <8 x i16> %1 to <8 x i32>
+  %3 = bitcast i16* %pix2 to <8 x i16>*
+  %4 = load <8 x i16>, <8 x i16>* %3, align 2
+  %5 = sext <8 x i16> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add nsw <8 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add nsw <8 x i32> %bin.rdx13, %rdx.shuf14
+  %10 = extractelement <8 x i32> %bin.rdx15, i32 0
+  ret i32 %10
+}
+
+; SAD of 8 zero-extended i16 elements.
+define signext i32 @func16u(i16* nocapture readonly %pix1, i16* nocapture readonly %pix2) {
+; CHECK-LABEL: @func16u
+; CHECK-DAG: vminuh [[MIN:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vmaxuh [[MAX:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
+; CHECK-DAG: vxor [[ZERO:[0-9]+]], [[ZERO]], [[ZERO]]
+; CHECK: vsubuhm [[ABS:[0-9]+]], [[MAX]], [[MIN]]
+; CHECK-DAG: vand [[EVEN:[0-9]+]], [[ABS]], {{[0-9]+}}
+; CHECK-DAG: vperm [[ODD:[0-9]+]], [[ZERO]], [[ABS]], {{[0-9]+}}
+; CHECK: vsumsws [[SUM1:[0-9]+]], [[EVEN]], [[ZERO]]
+; CHECK: vsumsws [[SUM2:[0-9]+]], [[ODD]], [[SUM1]]
+; CHECK: mfvsrwz {{[0-9]+}}
+
+entry:
+  %0 = bitcast i16* %pix1 to <8 x i16>*
+  %1 = load <8 x i16>, <8 x i16>* %0, align 2
+  %2 = zext <8 x i16> %1 to <8 x i32>
+  %3 = bitcast i16* %pix2 to <8 x i16>*
+  %4 = load <8 x i16>, <8 x i16>* %3, align 2
+  %5 = zext <8 x i16> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add nsw <8 x i32> %9, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add nsw <8 x i32> %bin.rdx, %rdx.shuf12
+  %rdx.shuf14 = shufflevector <8 x i32> %bin.rdx13, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx15 = add nsw <8 x i32> %bin.rdx13, %rdx.shuf14
+  %10 = extractelement <8 x i32> %bin.rdx15, i32 0
+  ret i32 %10
+}