Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -328,6 +328,7 @@ bool NoInfs : 1; bool NoSignedZeros : 1; bool AllowReciprocal : 1; + bool Reduction : 1; public: /// Default constructor turns off all optimization flags. @@ -340,6 +341,7 @@ NoInfs = false; NoSignedZeros = false; AllowReciprocal = false; + Reduction = false; } // These are mutators for each flag. @@ -351,6 +353,7 @@ void setNoInfs(bool b) { NoInfs = b; } void setNoSignedZeros(bool b) { NoSignedZeros = b; } void setAllowReciprocal(bool b) { AllowReciprocal = b; } + void setReduction(bool b) { Reduction = b; } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } @@ -361,6 +364,7 @@ bool hasNoInfs() const { return NoInfs; } bool hasNoSignedZeros() const { return NoSignedZeros; } bool hasAllowReciprocal() const { return AllowReciprocal; } + bool hasReduction() const { return Reduction; } /// Return a raw encoding of the flags. /// This function should only be used to add data to the NodeID value. Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2322,6 +2322,7 @@ bool nuw = false; bool nsw = false; bool exact = false; + bool reduction = false; FastMathFlags FMF; if (const OverflowingBinaryOperator *OFBinOp = @@ -2335,10 +2336,41 @@ if (const FPMathOperator *FPOp = dyn_cast(&I)) FMF = FPOp->getFastMathFlags(); + // Check if this binary operation performs a reduction. + // FIXME: Support more reduction operations for more pattern recognitions. + if (OpCode == ISD::ADD) { + // Check if this binary op is a reduction. + auto IsReductionPHI = [](const Value *V) { + const PHINode *PN = dyn_cast(V); + return PN && PN->getMetadata("llvm.loop.vectorize.reduction"); + }; + + // This lambda checks if an instruction is a reduction one. It is a + // reduction if one of its operands is either a reduction PHI or another + // reduction operation. + std::function IsReductionOp = [&](const User *I) { + if (I == nullptr) + return false; + const BinaryOperator *BOp = dyn_cast(I); + if (BOp == nullptr) + return false; + Instruction::BinaryOps OpCode = BOp->getOpcode(); + if (OpCode != Instruction::Add) + return false; + return IsReductionPHI(I->getOperand(0)) || + IsReductionPHI(I->getOperand(1)) || + IsReductionOp(dyn_cast(I->getOperand(0))) || + IsReductionOp(dyn_cast(I->getOperand(1))); + }; + + reduction = IsReductionOp(&I); + } + SDNodeFlags Flags; Flags.setExact(exact); Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); + Flags.setReduction(reduction); if (EnableFMFInDAG) { Flags.setAllowReciprocal(FMF.allowReciprocal()); Flags.setNoInfs(FMF.noInfs()); @@ -2346,8 +2378,8 @@ Flags.setNoSignedZeros(FMF.noSignedZeros()); Flags.setUnsafeAlgebra(FMF.unsafeAlgebra()); } - SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), - Op1, Op2, &Flags); + SDValue BinNodeValue = + DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(), Op1, Op2, &Flags); setValue(&I, BinNodeValue); } Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -26926,9 +26926,163 @@ DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); } +// Check if the given SDValue is a constant vector with all Val. +static bool isConstVectorOf(SDValue V, int Val) { + BuildVectorSDNode *BV = dyn_cast(V); + if (!BV || !BV->isConstant()) + return false; + auto NumOperands = V.getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + ConstantSDNode *C = dyn_cast(V.getOperand(i)); + if (!C || C->getSExtValue() != Val) + return false; + } + return true; +} + +static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + unsigned NumElems = VT.getVectorNumElements(); + + if (!(VT.getVectorElementType() == MVT::i32 && isPowerOf2_32(NumElems))) + return SDValue(); + + unsigned RegSize = 128; + if (Subtarget->hasAVX512()) + RegSize = 512; + else if (Subtarget->hasAVX2()) + RegSize = 256; + + if (VT.getSizeInBits() > RegSize) + return SDValue(); + + // Detect the following pattern: + // + // 1: %2 = zext %0 to + // 2: %3 = zext %1 to + // 3: %4 = sub nsw %2, %3 + // 4: %5 = icmp sgt %4, [0 x N] or [-1 x N] + // 5: %6 = sub nsw zeroinitializer, %4 + // 6: %7 = select %5, %4, %6 + // 7: %8 = add nsw %7, %vec.phi + // + // The last instruction must be a reduction add. The instructions 3-6 forms an + // ABSDIFF pattern. + + // The two operands of reduction add are from PHI and a select-op as in line 7 + // above. + SDValue SelectOp, Phi; + if (Op0.getOpcode() == ISD::VSELECT) { + SelectOp = Op0; + Phi = Op1; + } else if (Op1.getOpcode() == ISD::VSELECT) { + SelectOp = Op1; + Phi = Op0; + } else + return SDValue(); + + // Check the condition of the select instruction is greater-than. + SDValue SetCC = SelectOp->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC) + return SDValue(); + ISD::CondCode CC = cast(SetCC.getOperand(2))->get(); + if (CC != ISD::SETGT) + return SDValue(); + + Op0 = SelectOp->getOperand(1); + Op1 = SelectOp->getOperand(2); + + // The second operand of SelectOp Op1 is the negation of the first operand + // Op0, which is implementes as 0 - Op0. + if (!(Op1.getOpcode() == ISD::SUB && isConstVectorOf(Op1.getOperand(0), 0) && + Op1.getOperand(1) == Op0)) + return SDValue(); + + // The first operand of SetCC is the first operand of SelectOp, which is the + // difference between two input vectors. + if (SetCC.getOperand(0) != Op0) + return SDValue(); + + // The second operand of > comparison can be either -1 or 0. + if (!(isConstVectorOf(SetCC.getOperand(1), 0) || + isConstVectorOf(SetCC.getOperand(1), -1))) + return SDValue(); + + // The first operand of SelectOp is the difference between two input vectors. + if (Op0.getOpcode() != ISD::SUB) + return SDValue(); + + Op1 = Op0.getOperand(1); + Op0 = Op0.getOperand(0); + + // Check if the operands of the diff are zero-extended from vectors of i8. + if (Op0.getOpcode() != ISD::ZERO_EXTEND || + Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || + Op1.getOpcode() != ISD::ZERO_EXTEND || + Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) + return SDValue(); + + // SAD pattern detected. Now build a SAD instruction and an addition for + // reduction. Note that the number of elments of the result of SAD is less + // than the number of elements of its input. Therefore, we could only update + // part of elements in the reduction vector. + + // Legalize the type of the inputs of PSADBW. + EVT InVT = Op0.getOperand(0).getValueType(); + if (InVT.getSizeInBits() <= 128) + RegSize = 128; + else if (InVT.getSizeInBits() <= 256) + RegSize = 256; + + unsigned NumConcat = + RegSize / Op0.getOperand(0).getValueType().getSizeInBits(); + SmallVector Ops( + NumConcat, DAG.getConstant(0, DL, Op0.getOperand(0).getValueType())); + Ops[0] = Op0.getOperand(0); + MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + Ops[0] = Op1.getOperand(0); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); + + // The output of PSADBW is a vector of i64. + MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); + SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVT, Op0, Op1); + + // We need to turn the vector of i64 into a vector of i32. + MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); + Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); + + NumConcat = VT.getSizeInBits() / ResVT.getSizeInBits(); + if (NumConcat > 1) { + // Update part of elements of the reduction vector. This is done by first + // extracting a sub-vector from it, updating this sub-vector, and inserting + // it back. + SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi, + DAG.getIntPtrConstant(0, DL)); + SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, + DAG.getIntPtrConstant(0, DL)); + } else + return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); +} + /// PerformADDCombine - Do target-specific dag combines on integer adds. static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { + const SDNodeFlags *Flags = &cast(N)->Flags; + if (Flags->hasReduction()) { + SDValue Sad = detectSADPattern(N, DAG, Subtarget); + if (Sad.getNode()) + return Sad; + } + EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4066,6 +4066,10 @@ int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; +defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, + int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; +defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, + int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; let Predicates = [HasAVX] in defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3618,8 +3618,11 @@ // This is phase one of vectorizing PHIs. Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); - Entry[part] = PHINode::Create( + PHINode *PHI = PHINode::Create( VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); + MDNode *MD = MDNode::get(PHI->getContext(), None); + PHI->setMetadata("llvm.loop.vectorize.reduction", MD); + Entry[part] = PHI; } PV->push_back(P); return; Index: test/CodeGen/X86/sad.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/sad.ll @@ -0,0 +1,65 @@ +; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 -force-target-max-vector-interleave=1 -unroll-count=1 | llc | FileCheck %s --check-prefix=SSE2 +; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -force-target-max-vector-interleave=1 -unroll-count=1 | llc | FileCheck %s --check-prefix=AVX2 +; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw -force-target-max-vector-interleave=1 -unroll-count=1 | llc | FileCheck %s --check-prefix=AVX512BW +; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 -force-target-max-vector-interleave=1 | llc | FileCheck %s --check-prefix=SSE2-UNROLL +; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -force-target-max-vector-interleave=1 | llc | FileCheck %s --check-prefix=AVX2-UNROLL +; RUN: opt < %s -O2 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw -force-target-max-vector-interleave=1 | llc | FileCheck %s --check-prefix=AVX512BW-UNROLL + +@a = global [1024 x i8] zeroinitializer, align 16 +@b = global [1024 x i8] zeroinitializer, align 16 + +define i32 @sad() { +; SSE2-LABEL: sad +; SSE2: psadbw %xmm1, %xmm2 +; SSE2: paddd %xmm2, %xmm0 +; +; AVX2-LABEL: sad +; AVX2: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX2: vpaddd %xmm0, %xmm1, %xmm1 +; +; AVX512BW-LABEL: sad +; AVX512BW: vpsadbw {{.*}}, %xmm1, %xmm1 +; AVX512BW: vpaddd %xmm0, %xmm1, %xmm1 +; +; SSE2-UNROLL-LABEL: sad +; SSE2-UNROLL: psadbw %xmm1, %xmm2 +; SSE2-UNROLL: paddd %xmm0, %xmm2 +; SSE2-UNROLL: psadbw %xmm1, %xmm0 +; SSE2-UNROLL: paddd %xmm2, %xmm0 +; +; AVX2-UNROLL-LABEL: sad +; AVX2-UNROLL: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX2-UNROLL: vpaddd %xmm0, %xmm1, %xmm1 +; AVX2-UNROLL: vpsadbw %xmm2, %xmm3, %xmm2 +; AVX2-UNROLL: vpaddd %xmm1, %xmm2, %xmm1 +; +; AVX512BW-UNROLL-LABEL: sad +; AVX512BW-UNROLL: vpsadbw {{.*}}, %xmm1, %xmm1 +; AVX512BW-UNROLL: vpaddd %xmm0, %xmm1, %xmm1 +; AVX512BW-UNROLL: vpsadbw {{.*}}, %xmm1, %xmm1 +; AVX512BW-UNROLL: vpaddd %xmm0, %xmm1, %xmm1 +; +entry: + br label %for.body + +for.cond.cleanup: + ret i32 %add + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %s.010 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %sub = sub nsw i32 %conv, %conv3 + %ispos = icmp sgt i32 %sub, -1 + %neg = sub nsw i32 0, %sub + %2 = select i1 %ispos, i32 %sub, i32 %neg + %add = add nsw i32 %2, %s.010 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} Index: test/Transforms/LoopVectorize/vectorize-once.ll =================================================================== --- test/Transforms/LoopVectorize/vectorize-once.ll +++ test/Transforms/LoopVectorize/vectorize-once.ll @@ -68,11 +68,11 @@ attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" } -; CHECK: !0 = distinct !{!0, !1, !2} -; CHECK: !1 = !{!"llvm.loop.vectorize.width", i32 1} -; CHECK: !2 = !{!"llvm.loop.interleave.count", i32 1} -; CHECK: !3 = distinct !{!3, !4, !1, !2} -; CHECK: !4 = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: !1 = distinct !{!1, !2, !3} +; CHECK: !2 = !{!"llvm.loop.vectorize.width", i32 1} +; CHECK: !3 = !{!"llvm.loop.interleave.count", i32 1} +; CHECK: !4 = distinct !{!4, !5, !2, !3} +; CHECK: !5 = !{!"llvm.loop.unroll.runtime.disable"} !0 = !{!0, !1} !1 = !{!"llvm.loop.vectorize.width", i32 1}