Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -328,6 +328,7 @@ bool NoInfs : 1; bool NoSignedZeros : 1; bool AllowReciprocal : 1; + bool Reduction : 1; public: /// Default constructor turns off all optimization flags. @@ -340,6 +341,7 @@ NoInfs = false; NoSignedZeros = false; AllowReciprocal = false; + Reduction = false; } // These are mutators for each flag. @@ -351,6 +353,7 @@ void setNoInfs(bool b) { NoInfs = b; } void setNoSignedZeros(bool b) { NoSignedZeros = b; } void setAllowReciprocal(bool b) { AllowReciprocal = b; } + void setReduction(bool b) { Reduction = b; } // These are accessors for each flag. bool hasNoUnsignedWrap() const { return NoUnsignedWrap; } @@ -361,6 +364,7 @@ bool hasNoInfs() const { return NoInfs; } bool hasNoSignedZeros() const { return NoSignedZeros; } bool hasAllowReciprocal() const { return AllowReciprocal; } + bool hasReduction() const { return Reduction; } /// Return a raw encoding of the flags. /// This function should only be used to add data to the NodeID value. Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -2399,9 +2399,13 @@ // class PHINode : public Instruction { void *operator new(size_t, unsigned) = delete; + /// ReservedSpace - The number of operands actually allocated. NumOperands is /// the number actually in use. - unsigned ReservedSpace; + unsigned ReservedSpace : 31; + /// IsReduction - a flag indicating if this Phi is a reduction Phi.
+ bool IsReduction : 1; + PHINode(const PHINode &PN); // allocate space for exactly zero operands void *operator new(size_t s) { @@ -2409,17 +2413,17 @@ } explicit PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", - Instruction *InsertBefore = nullptr) + Instruction *InsertBefore = nullptr, bool IsRdx = false) : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore), - ReservedSpace(NumReservedValues) { + ReservedSpace(NumReservedValues), IsReduction(IsRdx) { setName(NameStr); allocHungoffUses(ReservedSpace); } PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr, - BasicBlock *InsertAtEnd) + BasicBlock *InsertAtEnd, bool IsRdx = false) : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertAtEnd), - ReservedSpace(NumReservedValues) { + ReservedSpace(NumReservedValues), IsReduction(IsRdx) { setName(NameStr); allocHungoffUses(ReservedSpace); } @@ -2441,12 +2445,14 @@ /// edges that this phi node will have (use 0 if you really have no idea). static PHINode *Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", - Instruction *InsertBefore = nullptr) { - return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore); + Instruction *InsertBefore = nullptr, + bool IsRdx = false) { + return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore, IsRdx); } static PHINode *Create(Type *Ty, unsigned NumReservedValues, - const Twine &NameStr, BasicBlock *InsertAtEnd) { - return new PHINode(Ty, NumReservedValues, NameStr, InsertAtEnd); + const Twine &NameStr, BasicBlock *InsertAtEnd, + bool IsRdx = false) { + return new PHINode(Ty, NumReservedValues, NameStr, InsertAtEnd, IsRdx); } /// Provide fast operand accessors @@ -2577,6 +2583,8 @@ /// same value, return the value, otherwise return null.
Value *hasConstantValue() const; + bool isReduction() const { return IsReduction; } + /// Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { return I->getOpcode() == Instruction::PHI; Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2317,6 +2317,7 @@ bool nuw = false; bool nsw = false; bool exact = false; + bool reduction = false; FastMathFlags FMF; if (const OverflowingBinaryOperator *OFBinOp = @@ -2330,10 +2331,20 @@ if (const FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I)) FMF = FPOp->getFastMathFlags(); + // Mark this binary op as a reduction if either operand is a reduction phi. + { + const PHINode *PN; + if ((PN = dyn_cast<PHINode>(I.getOperand(0)))) + reduction = PN->isReduction(); + if (!reduction && (PN = dyn_cast<PHINode>(I.getOperand(1)))) + reduction = PN->isReduction(); + } + SDNodeFlags Flags; Flags.setExact(exact); Flags.setNoSignedWrap(nsw); Flags.setNoUnsignedWrap(nuw); + Flags.setReduction(reduction); if (EnableFMFInDAG) { Flags.setAllowReciprocal(FMF.allowReciprocal()); Flags.setNoInfs(FMF.noInfs()); Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -26621,9 +26621,143 @@ DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp); } +/// detectSADPattern - Try to replace a reduction add of an absolute difference +/// of zero-extended i8 vectors with an X86ISD::PSADBW based sequence. Returns +/// the replacement value, or an empty SDValue if \p N does not match. +static SDValue detectSADPattern(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + unsigned NumElems = VT.getVectorNumElements(); + + if (!(VT.getVectorElementType() == MVT::i32 && isPowerOf2_32(NumElems))) + return SDValue(); + + // PSADBW is only available with SSE2. + if (!Subtarget->hasSSE2()) + return SDValue(); + + if (Subtarget->hasAVX512()) { + if (VT.getSizeInBits() > 512) + return
SDValue(); + } else if (Subtarget->hasAVX2()) { + if (VT.getSizeInBits() > 256) + return SDValue(); + } else { + if (VT.getSizeInBits() > 128) + return SDValue(); + } + + // Detect the following pattern: + // + // 1: %2 = zext <4 x i8> %0 to <4 x i32> + // 2: %3 = zext <4 x i8> %1 to <4 x i32> + // 3: %4 = sub nsw <4 x i32> %2, %3 + // 4: %5 = icmp sgt <4 x i32> %4, zeroinitializer + // 5: %6 = sub nsw <4 x i32> zeroinitializer, %4 + // 6: %7 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> %6 + // 7: %8 = add nsw <4 x i32> %7, %vec.phi + // + // The last instruction must be a reduction add. Instructions 3-6 form an + // ABSDIFF pattern. + + SDValue SelectOp, Phi; + if (Op0.getOpcode() == ISD::CopyFromReg && Op1.getOpcode() == ISD::VSELECT) { + SelectOp = Op1; + Phi = Op0; + } else if (Op1.getOpcode() == ISD::CopyFromReg && + Op0.getOpcode() == ISD::VSELECT) { + SelectOp = Op0; + Phi = Op1; + } else + return SDValue(); + + SDValue SetCC = SelectOp->getOperand(0); + if (SetCC.getOpcode() != ISD::SETCC) + return SDValue(); + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get(); + if (CC != ISD::SETGT) + return SDValue(); + + Op0 = SelectOp->getOperand(1); + Op1 = SelectOp->getOperand(2); + if (Op0.getOpcode() != ISD::SUB || Op1.getOpcode() != ISD::SUB) + return SDValue(); + + if (SetCC.getOperand(0) != Op0 || Op1.getOperand(1) != Op0) + return SDValue(); + + // A lambda checking the given SDValue is a constant vector with all zeros.
+ auto IsConstVectorOfZeros = [](SDValue V) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || !BV->isConstant()) + return false; + auto NumOperands = V.getNumOperands(); + for (unsigned i = 0; i < NumOperands; i++) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + if (!C || !C->isNullValue()) + return false; } + return true; + }; + + if (!IsConstVectorOfZeros(SetCC.getOperand(1)) || + !IsConstVectorOfZeros(Op1.getOperand(0))) + return SDValue(); + + Op1 = Op0.getOperand(1); + Op0 = Op0.getOperand(0); + + if (Op0.getOpcode() != ISD::ZERO_EXTEND || + Op1.getOpcode() != ISD::ZERO_EXTEND) + return SDValue(); + // Operand 0 is only known to be a vector once both nodes are ZERO_EXTENDs. + assert(Op0.getOperand(0).getValueType().isVector()); + assert(Op1.getOperand(0).getValueType().isVector()); + if (Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || + Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) + return SDValue(); + + unsigned NumConcat = 128 / Op0.getOperand(0).getValueType().getSizeInBits(); + SmallVector<SDValue, 16> Ops( + NumConcat, DAG.getConstant(0, DL, Op0.getOperand(0).getValueType())); + Ops[0] = Op0.getOperand(0); + Op0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Ops); + Ops[0] = Op1.getOperand(0); + Op1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Ops); + + SDValue SAD = + DAG.getNode(X86ISD::PSADBW, DL, MVT::v16i8, Op0, Op1); + SAD = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, SAD); + + // PSADBW always produces a 128-bit value. Resize it to VT (padding with or + // dropping zero lanes) so the operand types of the final ADD match. + if (NumElems > 4) { + SmallVector<SDValue, 4> Parts(NumElems / 4, + DAG.getConstant(0, DL, MVT::v4i32)); + Parts[0] = SAD; + SAD = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Parts); + } else if (NumElems < 4) { + SAD = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SAD, + DAG.getIntPtrConstant(0, DL)); + } + return DAG.getNode(ISD::ADD, DL, VT, SAD, Phi); +} + /// PerformADDCombine - Do target-specific dag combines on integer adds.
static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { + const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags; + if (Flags->hasReduction()) { // Reduction adds may form a SAD pattern. + SDValue SAD = detectSADPattern(N, DAG, Subtarget); + if (SAD.getNode()) + return SAD; + } + EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -4062,6 +4062,8 @@ SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PSADBW : PDI_binop_all<0xF6, "psadbw", X86psadbw, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4082,8 +4084,6 @@ int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_PMADD, 1>; let Predicates = [HasAVX2] in def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ lib/Target/X86/X86IntrinsicsInfo.h @@ -278,6 +278,8 @@ X86_INTRINSIC_DATA(avx2_pmovzxdq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx2_pmovzxwd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(avx2_pmovzxwq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), + // FIXME: Entry looks out of sorted order (cf. sse2_psad_bw); move after avx2_pmul*. + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), @@ -1694,6 +1696,7 @@
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3612,7 +3612,8 @@ Type *VecTy = (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF); Entry[part] = PHINode::Create( - VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt()); + VecTy, 2, "vec.phi", &*LoopVectorBody.back()->getFirstInsertionPt(), + true /* IsRdx: mark the widened phi as a reduction phi */); } PV->push_back(P); return;