Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -449,6 +449,10 @@ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const; + /// \returns the cost of SAD instruction. + unsigned getSADInstrCost(unsigned ISD, Type *RetTy, Type *op1, + Type *op2) const; + /// \returns The number of pieces into which the provided type must be /// split during legalization. Zero is returned when the answer is unknown. unsigned getNumberOfParts(Type *Tp) const; @@ -565,6 +569,8 @@ bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) = 0; + virtual unsigned getSADInstrCost(unsigned ISD, Type *RetTy, Type *op1, + Type *op2) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) = 0; virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef Tys) = 0; @@ -719,6 +725,10 @@ ArrayRef Tys) override { return Impl.getIntrinsicInstrCost(ID, RetTy, Tys); } + unsigned getSADInstrCost(unsigned ISD, Type *RetTy, Type *op1, + Type *op2) override { + return Impl.getSADInstrCost(ISD, RetTy, op1, op2); + } unsigned getNumberOfParts(Type *Tp) override { return Impl.getNumberOfParts(Tp); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -301,6 +301,11 @@ return 1; } + unsigned getSADInstrCost(unsigned ISD, Type *RetTy, + Type *op1, Type *op2) { + return 1; + } + unsigned getNumberOfParts(Type *Tp) { return 0; } unsigned getAddressComputationCost(Type *Tp, bool) { return 0; } Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -615,6 +615,10 @@ case Intrinsic::masked_load: return static_cast(this) ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0); + case Intrinsic::sad: + ISD = ISD::SAD; + return static_cast(this) + ->getSADInstrCost(ISD, RetTy, Tys[0], Tys[1]); } const TargetLoweringBase *TLI = getTLI(); Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -643,6 +643,10 @@ /// specifier. PREFETCH, + /// SAD - This corresponds to a SAD instruction. The operands are vectors + /// and the result is a scalar type. + SAD, + /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) /// This corresponds to the fence instruction. It takes an input chain, and /// two integer constants: an AtomicOrdering and a SynchronizationScope. Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -580,6 +580,11 @@ // maps to void __clear_cache() on supporting platforms def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; +// Instrinsic takes any vector as argument and generate the +// Sum of Absolute difference(SAD) Instrunction according to target. +def int_sad : Intrinsic<[llvm_i32_ty], + [llvm_anyvector_ty, llvm_anyvector_ty], + [IntrNoMem]>; //===-------------------------- Masked Intrinsics -------------------------===// // Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -190,6 +190,11 @@ Opd1PropInfo, Opd2PropInfo); } +unsigned TargetTransformInfo::getSADInstrCost(unsigned ISD, Type *RetTy, + Type *op1, Type *op2) const { + return TTIImpl->getSADInstrCost(ISD, RetTy, op1, op2); +} + unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index, Type *SubTp) const { return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1287,7 +1287,12 @@ return; } break; - + case ISD::SAD: + // Sad operation with illegal type will be custom-lowered. + Action = TLI.getOperationAction(Node->getOpcode(), + Node->getOperand(0).getValueType()); + SimpleFinishLegalizing = false; + break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { Action = TargetLowering::Legal; @@ -1382,7 +1387,7 @@ dbgs() << "\n"; #endif llvm_unreachable("Do not know how to legalize this operator!"); - + case ISD::SAD: case ISD::CALLSEQ_START: case ISD::CALLSEQ_END: break; Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4415,6 +4415,32 @@ return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS); } +/// ExpandSad - expand the llvm sad intrinsic. +static SDValue ExpandSad(SDLoc DL, SDValue LHS, SDValue RHS, + SelectionDAG &DAG) { + SDValue result; + // if the type of SAD is legal i.e. v16i8, then the output of SAD + // instruction will be v2i64. We need to extract result from + // upper and lower half and add manually. + if(LHS.getValueType() == MVT::v16i8 && RHS.getValueType() == MVT::v16i8) + { + SDValue sad = DAG.getNode(ISD::SAD, DL, MVT::v2i64, LHS, RHS); + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, + sad, DAG.getConstant(0, VecIdxTy)); + SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, sad, + DAG.getConstant(1, VecIdxTy)); + + BottomHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BottomHalf); + TopHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, TopHalf); + result = DAG.getNode(ISD::ADD, DL, MVT::i32, TopHalf, BottomHalf); + } + else + // If the type is not legal then just generare SAD node which will + // be handled in custom lowering. + result = DAG.getNode(ISD::SAD, DL, MVT::i32, LHS, RHS); + return result; +} /// ExpandPowI - Expand a llvm.powi intrinsic. static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS, @@ -5147,6 +5173,11 @@ getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return nullptr; + case Intrinsic::sad: { + setValue(&I, ExpandSad(sdl, getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), DAG)); + return nullptr; + } case Intrinsic::cttz: { SDValue Arg = getValue(I.getArgOperand(0)); ConstantInt *CI = cast(I.getArgOperand(1)); Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2084,6 +2084,16 @@ switch (Opcode) { default: break; + case ISD::SAD: { + SDNode* New; + SDValue Ops[] = {Node->getOperand(0), Node->getOperand(1)}; + if(Subtarget->hasAVX() || Subtarget->hasSSE1()) + New = CurDAG->getMachineNode(X86::PSADBWrr, dl, Node->getValueType(0), Ops); + else if(Subtarget->hasMMX()) + New = CurDAG->getMachineNode(X86::MMX_PSADBWirr, dl, + Node->getValueType(0), Ops); + return New; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); switch (IntNo) { Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1041,7 +1041,9 @@ setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); - + + setOperationAction(ISD::SAD, MVT::v8i8, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); @@ -16768,6 +16770,45 @@ return SDValue(); } +static SDValue LowerSAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDNode *Node = Op.getNode(); + SDLoc dl(Node); + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + SDValue result; + if (Op.getSimpleValueType() != MVT::i32) + return SDValue(); + + if (Subtarget->hasSSE1() || Subtarget->hasAVX()) { + EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(); + SDValue V0 = DAG.getUNDEF(MVT::v8i8); + SDValue V1 = DAG.getUNDEF(MVT::v8i8); + SDValue Op0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, + V0, N0); + SDValue Op1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, + V1, N1); + SDValue Ops[] = {Op0, Op1}; + SDValue SAD = DAG.getNode(ISD::SAD, dl, MVT::v2i64, Ops); + SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + dl, MVT::i64, SAD, + DAG.getConstant(0, VecIdxTy)); + result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf); + + }else if (Subtarget->hasMMX()) { + /// Convert 1st arg type of SUB, i8 to X86_MMX + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::x86mmx, N0); + /// Convert 2nd arg type of SUB, i8 to X86_MMX + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::x86mmx, N1); + SDValue Ops[] = {Op0, Op1}; + SDValue SAD = DAG.getNode(ISD::SAD, dl, MVT::x86mmx, Ops); + SDValue bitcast = DAG.getNode(ISD::BITCAST, dl, MVT::i64, SAD); + result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, bitcast); + } + else + result = SDValue(); + return result; +} static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); @@ -17106,6 +17147,7 @@ case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::SAD: return LowerSAD(Op, Subtarget, DAG); } } Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -73,6 +73,8 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(); + unsigned getSADInstrCost(unsigned ISD, Type *RetTy, + Type *op1, Type *op2); unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -78,6 +78,34 @@ return 2; } +unsigned X86TTIImpl::getSADInstrCost(unsigned ISD, Type *RetTy, + Type *op1, Type *op2) { + + unsigned result = 0; + std::pair LT = TLI->getTypeLegalizationCost(op1); + EVT arg1Ty = TLI->getValueType(op1); + EVT arg2Ty = TLI->getValueType(op2); + assert((arg1Ty == arg2Ty) && + "cannot handle different type of arguments for SAD"); + + MVT MTy = arg1Ty.getSimpleVT(); + + static const CostTblEntry SSE1CostTable[] = { + {ISD::SAD, MVT::v8i8, 4}, + {ISD::SAD, MVT::v16i8, 5}, + }; + + if (ST->hasSSE1() || ST->hasAVX()) { + int Idx = CostTableLookup(SSE1CostTable, ISD, MTy); + if (Idx != -1) + return result = LT.first * SSE1CostTable[Idx].Cost; + } + if (ST->is64Bit() || ST->hasMMX()) { + return 4; + } + return 0; +} + unsigned X86TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -289,7 +289,11 @@ void createEmptyLoop(); /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); - + + /// While widening the instructions, skip the SAD pattern and replace + /// with SAD intrinsic call. + void generateSADInstruction(Instruction *I, SmallVector &args, + VectorParts &Entry); /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -572,7 +576,8 @@ RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. RK_FloatMult, ///< Product of floats. - RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). + RK_FloatMinMax, ///< Min/max implemented in terms of select(cmp()). + RK_IntegerSpecial ///< Special patterns like SAD. }; /// This enum represents the kinds of inductions that we support. @@ -593,6 +598,45 @@ MRK_FloatMax }; + // This enum is used to keep track of actions to be performed on + // Instructions in special patterns. + enum SpecialActionKind { + SAK_Invalid, + SAK_Skip, ///< Skip instruction which is part of special pattern + SAK_Replace, ///< replace the instruction + SAK_Arg, ///< arguments + SAK_Phi ///< Phi instruction + }; + + // This keeps track of special pattern kind + enum SpecialPatternKind { + SPK_Invalid, + SPK_Sad + }; + + // This struct holds information about each instruction in + // Special Patterns + struct SpecialPatternDescriptor { + SpecialPatternDescriptor(SpecialPatternKind SPKind, + SpecialActionKind SAK, unsigned miVF, + unsigned maVF) + : Kind(SPKind), Action(SAK), minVF(miVF), maxVF(maVF) {} + SpecialPatternDescriptor() : Kind(SPK_Invalid), Action(SAK_Invalid), + minVF(0), maxVF(1) {} + // Kind - to keep track of which pattern the instruction belongs + SpecialPatternKind Kind; + // Action - What is the action to be performed for this instruction + SpecialActionKind Action; + // minVF - tells about the minimum VF for which this pattern is possible + // For Instance, in X86 the minVF for SAD is 8. We can not think of pattern + // replace for less than 8. + unsigned minVF; + // maXVF tells about the maximun VF foe which this pattern is possible. + // For instance, in X86 the maxVF for SAD is 16. SAD operation cannot be + // performed for VF more than 16 as 32-SAD is not present. + unsigned maxVF; + }; + /// This struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), @@ -616,11 +660,16 @@ /// This POD struct holds information about a potential reduction operation. struct ReductionInstDesc { ReductionInstDesc(bool IsRedux, Instruction *I) : - IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} + IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid), + SPKind(SPK_Invalid) {} ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} + ReductionInstDesc(Instruction *I, MinMaxReductionKind K, + SpecialPatternKind SPK) : + IsReduction(true), PatternLastInst(I), MinMaxKind(K), SPKind(SPK) {} + // Is this instruction a reduction candidate. bool IsReduction; // The last instruction in a min/max pattern (select of the select(icmp()) @@ -628,6 +677,8 @@ Instruction *PatternLastInst; // If this is a min/max pattern the comparison predicate. MinMaxReductionKind MinMaxKind; + // if special pattern then hold the pattern kind. + SpecialPatternKind SPKind; }; /// A struct for saving information about induction variables. @@ -703,6 +754,10 @@ /// induction descriptor. typedef MapVector InductionList; + /// ActionList stores the actions to be performed for an instruction + /// which is part of special pattern in a basic block. + typedef DenseMap ActionList; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -717,6 +772,9 @@ /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns the actionlist for instructions in the loop. + ActionList *getActionMap() { return &SpecialActions; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -808,6 +866,13 @@ /// Collect the variables that need to stay uniform after vectorization. void collectLoopUniforms(); + /// Returns true if we find a SAD pattern. + bool isSADPattern(PHINode *Phi); + + /// Returns a ReductionInstDesc with SpecialPatternKind set if it matches + /// any special pattern. + ReductionInstDesc isSpecialPattern(PHINode *Phi, ReductionInstDesc &Prev); + /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -874,6 +939,8 @@ PHINode *Induction; /// Holds the reduction variables. ReductionList Reductions; + /// Holds the actions for instructions in Loop. + ActionList SpecialActions; /// Holds all of the induction variables that we found in the loop. /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. @@ -935,6 +1002,9 @@ /// 64 bit loop indices. unsigned getWidestType(); + /// \return The size (in bits) of the smallest type in the code that + /// needs to be vectorized. + unsigned getSmallestType(); /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor /// based on register pressure and other parameters. @@ -966,6 +1036,10 @@ /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// Returns the execution time cost of a special reduction instruction + /// for a given vector width. + unsigned SpecialPhiCost(PHINode *p, unsigned VF); /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. @@ -2437,7 +2511,8 @@ case RK_IntegerXor: case RK_IntegerAdd: case RK_IntegerOr: - // Adding, Xoring, Oring zero to a number does not change it. + case RK_IntegerSpecial: + // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); case RK_IntegerMult: // Multiplying a number by 1 does not change it. @@ -2461,6 +2536,7 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: + case LoopVectorizationLegality::RK_IntegerSpecial: return Instruction::Add; case LoopVectorizationLegality::RK_IntegerMult: return Instruction::Mul; @@ -2597,6 +2673,47 @@ return V; } +void InnerLoopVectorizer::generateSADInstruction( + Instruction *I, SmallVector &args, + VectorParts &Entry) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[I]; + switch (SPD.Action) { + case LoopVectorizationLegality::SAK_Arg: { + Instruction *cv = dyn_cast(I); + if (cv) { + VectorParts &Op = getVectorValue(cv->getOperand(0)); + args.push_back(Op[0]); + } + break; + } + case LoopVectorizationLegality::SAK_Skip: + // do nothing + break; + case LoopVectorizationLegality::SAK_Replace: { + Module *M = I->getParent()->getParent()->getParent(); + SmallVector Tys; + Tys.push_back(args[0]->getType()); + Tys.push_back(args[1]->getType()); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::sad, Tys); + Value *intr_call = Builder.CreateCall(F, args); + VectorParts &phi = getVectorValue(I->getOperand(1)); + Value *add = Builder.CreateBinOp(Instruction::Add, intr_call, phi[0]); + Entry[0] = add; + propagateMetadata(Entry, I); + break; + } + case LoopVectorizationLegality::SAK_Phi: { + Type *VecTy = I->getType(); + Entry[0] = PHINode::Create(VecTy, 2, "sadvec.phi", + LoopVectorBody.back()->getFirstInsertionPt()); + break; + } + case LoopVectorizationLegality::SAK_Invalid: + assert(0 && "Unknown action to be performed!!"); + break; + } +} void InnerLoopVectorizer::vectorizeLoop() { //===------------------------------------------------===// // @@ -2639,6 +2756,7 @@ for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; + bool isSAD = false; assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. @@ -2646,6 +2764,15 @@ "Unable to find the reduction variable"); LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + + if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[RdxPhi]; + + if (SPD.minVF <= VF && SPD.maxVF >= VF && + SPD.Kind == LoopVectorizationLegality::SPK_Sad ) + isSAD = true; + } setDebugLocFromInst(Builder, RdxDesc.StartValue); @@ -2673,6 +2800,15 @@ RdxDesc.StartValue, "minmax.ident"); } + } else if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[RdxPhi]; + if (SPD.minVF <= VF && SPD.maxVF >= VF + && SPD.Kind == LoopVectorizationLegality::SPK_Sad) { + // assert(RdxDesc.SPKind == LoopVectorizationLegality::SPK_Sad && + // "Doesnot support other special kinds"); + VectorStart = Identity = RdxDesc.StartValue; + } } else { // Handle other reduction kinds: Constant *Iden = @@ -2747,7 +2883,7 @@ ReducedPartRdx, RdxParts[part]); } - if (VF > 1) { + if (VF > 1 && !isSAD) { // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. @@ -3028,8 +3164,28 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. + SmallVector args; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); + if (Legal->getActionMap()->count(it)) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[it]; + if (SPD.minVF <= VF && SPD.maxVF >= VF) { + switch (SPD.Kind) { + case LoopVectorizationLegality::SPK_Sad: + generateSADInstruction(it, args, Entry); + if (SPD.Action == LoopVectorizationLegality::SAK_Phi) { + PHINode *P = cast(it); + PV->push_back(P); + } + break; + case LoopVectorizationLegality::SPK_Invalid: + assert(0 && "Cannot be Invalid here!!"); + break; + } + continue; + } + } switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3549,6 +3705,11 @@ continue; } + // Check the Special Patterns. + if (AddReductionVar(Phi, RK_IntegerSpecial)) { + DEBUG(dbgs() << "LV: Found a Special reduction PHI." << *Phi <<"\n"); + continue; + } if (AddReductionVar(Phi, RK_IntegerAdd)) { DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); continue; @@ -3779,6 +3940,16 @@ return Stride; } +LoopVectorizationLegality::ReductionInstDesc +LoopVectorizationLegality::isSpecialPattern(PHINode *Phi, + ReductionInstDesc &Prev) { + if (isSADPattern(Phi)) { + return ReductionInstDesc(Phi, Prev.MinMaxKind, SPK_Sad); + } else { + return Prev; + } +} + void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) @@ -4023,6 +4194,11 @@ if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; + ReduxDesc = isSpecialPattern(Phi, ReduxDesc); + + if (Kind == RK_IntegerSpecial && ReduxDesc.SPKind == SPK_Invalid) + return false; + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. @@ -4039,6 +4215,90 @@ return true; } +/// \brief This function checks for the following pattern + +/// %sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ] +/// .... +/// %sub = sub nsw i32 %conv, %conv3 +/// %ispos = icmp uge i8 %0, %1 +/// %neg = sub nsw i32 0, %sub +/// %2 = select i1 %ispos, i32 %sub, i32 %neg +/// %add = add nsw i32 %2, %sum.05 + +/// This pattern is matched from bottom. if everything matches +/// then it returns true. +bool LoopVectorizationLegality::isSADPattern(PHINode *Phi) { + + LLVMContext &Context = TheLoop->getHeader()->getContext(); + // SmallVector *SadIntrList = &SPDesc.IntrList; + for (User *U : Phi->users()) { + // This is supposed to be an add instruction + if (!isa(U)) + return false; + Instruction *UI = dyn_cast(U); + if (UI->getOpcode() != Instruction::Add) + return false; + // all uses of Add instruction must be phi instructions + for (User *Ur : UI->users()) { + Instruction *I = dyn_cast(Ur); + if (!isa(I)) + return false; + } + // Checking for Selection instruction + if (!isa(UI->getOperand(0))) + return false; + + // two subs and one icmp instruction are matched + Instruction *Sel = dyn_cast(UI->getOperand(0)); + if (!isa(Sel->getOperand(1))) + return false; + Instruction *Sub1 = dyn_cast(Sel->getOperand(1)); + if (!isa(Sel->getOperand(2))) + return false; + Instruction *Sub2 = dyn_cast(Sel->getOperand(2)); + if (Sub1->getOpcode() != Instruction::Sub || + Sub2->getOpcode() != Instruction::Sub || + !isa(Sel->getOperand(0))) + return false; + Instruction *cv1 = dyn_cast(Sub1->getOperand(0)); + Instruction *cv2 = dyn_cast(Sub1->getOperand(1)); + if (!cv1 || !cv2 || (cv1->getOperand(0)->getType()->getScalarType() != + Type::getInt8Ty(Context)) || + (cv2->getOperand(0)->getType()->getScalarType() != + Type::getInt8Ty(Context))) + return false; + + if (!isa(Sel->getOperand(0))) + return false; + Instruction *icmp = dyn_cast(Sel->getOperand(0)); + if (icmp->getOperand(0) != Sub1 || Sub2->getOperand(1) != Sub1) { + if (((icmp->getOperand(0) != cv1->getOperand(0)) && + (icmp->getOperand(1) != cv1->getOperand(0))) || + ((icmp->getOperand(0) != cv2->getOperand(0)) && + (icmp->getOperand(1) != cv2->getOperand(0)))) + return false; + } + // All these instructions are added to ActionList + SpecialActions[UI] = + SpecialPatternDescriptor(SPK_Sad, SAK_Replace, 8, 16); // SAK_Replace + SpecialActions[Sel] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[icmp] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[Sub2] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[Sub1] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[cv1] = + SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg + SpecialActions[cv2] = + SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg + SpecialActions[Phi] = + SpecialPatternDescriptor(SPK_Sad, SAK_Phi, 8, 16); // SAK_Phi + } + return true; +} + /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). LoopVectorizationLegality::ReductionInstDesc @@ -4095,6 +4355,12 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind, ReductionInstDesc &Prev) { + // This is to bypass this function for specialPatterns. We dont know which instruction + // corresponds to reduction pattern. + if(Kind == RK_IntegerSpecial) { + return ReductionInstDesc(true, I); + } + bool FP = I->getType()->isFloatingPointTy(); bool FastMath = FP && I->hasUnsafeAlgebra(); switch (I->getOpcode()) { @@ -4279,7 +4545,7 @@ unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + unsigned WidestType = getSmallestType(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4418,7 +4684,49 @@ return MaxWidth; } +unsigned LoopVectorizationCostModel::getSmallestType() { + unsigned MinWidth = 8; + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + BasicBlock *BB = *bb; + + // For each instruction in the loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Type *T = it->getType(); + + // Ignore ephemeral values. + if (EphValues.count(it)) + continue; + + // Only examine Loads, Stores and PHINodes. + if (!isa(it) && !isa(it) && !isa(it)) + continue; + + // Examine PHI nodes that are reduction variables. + if (PHINode *PN = dyn_cast(it)) + if (!Legal->getReductionVars()->count(PN)) + continue; + + // Examine the stored values. + if (StoreInst *ST = dyn_cast(it)) + T = ST->getValueOperand()->getType(); + + // Ignore loaded pointer types and stored pointer types that are not + // consecutive. However, we do want to take consecutive stores/loads of + // pointer vectors into account. + if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + continue; + + MinWidth = std::min(MinWidth, + (unsigned)DL->getTypeSizeInBits(T->getScalarType())); + } + } + + return MinWidth; +} + unsigned LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize, unsigned VF, @@ -4704,6 +5012,22 @@ if (EphValues.count(it)) continue; + if (Legal->getActionMap()->count(it)) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[it]; + if (VF >= SPD.minVF && VF <= SPD.maxVF) { + if (SPD.Action == LoopVectorizationLegality::SAK_Phi) { + PHINode *p = dyn_cast(it); + unsigned PhiCost; + PhiCost= SpecialPhiCost(p, VF); + DEBUG(dbgs() << "LV: Found an estimated cost of " << PhiCost << " for VF " << + VF << " For special Phi instruction: " << *it << '\n'); + BlockCost += PhiCost; + } + continue; + } + } + unsigned C = getInstructionCost(it, VF); // Check if we should override the cost. @@ -4786,6 +5110,36 @@ return false; } +unsigned +LoopVectorizationCostModel::SpecialPhiCost(PHINode *P, unsigned VF) { + + if (Legal->getReductionVars()->count(P)) { + LoopVectorizationLegality::ReductionDescriptor RdxDesc = + (*Legal->getReductionVars())[P]; + assert(Legal->getActionMap()->count(P) && "something wrong"); + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[P]; + if (VF >= SPD.minVF && VF <= SPD.maxVF) { + switch (SPD.Kind) { + case LoopVectorizationLegality::SPK_Sad: { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + SmallVector Tys; + Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF)); + Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF)); + Type *RetTy = Type::getInt32Ty(Context); + Intrinsic::ID ID = Intrinsic::sad; + unsigned IntrinsicCost = TTI.getIntrinsicInstrCost(ID, RetTy, Tys); + return IntrinsicCost; + } break; + + default: + break; + } + } + } + return 0; +} + unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of