Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -452,6 +452,10 @@ unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const; + /// \returns the cost of SAD instruction. + unsigned getSADInstrCost(Type *RetTy, Type *op1, + Type *op2) const; + /// \returns The cost of Call instructions. unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) const; @@ -573,6 +577,8 @@ bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) = 0; + virtual unsigned getSADInstrCost(Type *RetTy, Type *op1, + Type *op2) = 0; virtual unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) = 0; virtual unsigned getNumberOfParts(Type *Tp) = 0; @@ -732,6 +738,10 @@ ArrayRef Tys) override { return Impl.getIntrinsicInstrCost(ID, RetTy, Tys); } + unsigned getSADInstrCost(Type *RetTy, Type *op1, + Type *op2) override { + return Impl.getSADInstrCost(RetTy, op1, op2); + } unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) override { return Impl.getCallInstrCost(F, RetTy, Tys); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -302,6 +302,10 @@ ArrayRef Tys) { return 1; } + + unsigned getSADInstrCost(Type *RetTy, Type *op1, Type *op2) { + return 1; + } unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -627,6 +627,9 @@ case Intrinsic::masked_load: return static_cast(this) ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0); + case Intrinsic::sad: + return static_cast(this) + ->getSADInstrCost(RetTy, Tys[0], Tys[1]); } const TargetLoweringBase *TLI = getTLI(); Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -642,6 +642,11 @@ /// read / write specifier, locality specifier and instruction / data cache /// specifier. PREFETCH, + + /// SAD - This corresponds to a Sum of Absolute Difference(SAD) instruction. + /// The operands are vectors of char type with length of vector being power + /// of 2 and the result is a scalar integer. + SAD, /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) /// This corresponds to the fence instruction. It takes an input chain, and Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -592,6 +592,12 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; +// Calculate the Sum of Absolute Differences (SAD) of the two input vectors. +// Only vectors of char Type are allowed. +def int_sad : Intrinsic<[llvm_i32_ty], + [llvm_anyvector_ty, llvm_anyvector_ty], + [IntrNoMem]>; + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>, Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -194,6 +194,11 @@ Opd1PropInfo, Opd2PropInfo); } +unsigned TargetTransformInfo::getSADInstrCost(Type *RetTy, Type *op1, + Type *op2) const { + return TTIImpl->getSADInstrCost(RetTy, op1, op2); +} + unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index, Type *SubTp) const { return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -73,6 +73,8 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(); + unsigned getSADInstrCost(Type *RetTy, Type *op1, + Type *op2); unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -77,7 +77,34 @@ return 2; } +unsigned X86TTIImpl::getSADInstrCost(Type *RetTy, Type *op1, + Type *op2) { + unsigned result = 0; + std::pair LT = TLI->getTypeLegalizationCost(op1); + EVT arg1Ty = TLI->getValueType(op1); + EVT arg2Ty = TLI->getValueType(op2); + assert((arg1Ty == arg2Ty) && + "cannot handle different type of arguments for SAD"); + + MVT MTy = arg1Ty.getSimpleVT(); + + static const CostTblEntry SSE1CostTable[] = { + {ISD::SAD, MVT::v8i8, 4}, + {ISD::SAD, MVT::v16i8, 5}, + }; + + if (ST->hasSSE1() || ST->hasAVX()) { + int Idx = CostTableLookup(SSE1CostTable, ISD::SAD, MTy); + if (Idx != -1) + return result = LT.first * SSE1CostTable[Idx].Cost; + } + if (ST->is64Bit() || ST->hasMMX()) { + return 4; + } + return 0; +} + unsigned X86TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -296,6 +296,11 @@ /// Copy and widen the instructions from the old loop. virtual void vectorizeLoop(); + /// While widening the instructions, skip the SAD pattern and replace + /// with SAD intrinsic call. + void generateSADInstruction(Instruction *I, SmallVector &args, + VectorParts &Entry); + /// \brief The Loop exit block may have single value PHI nodes where the /// incoming value is 'Undef'. While vectorizing we only handled real values /// that were defined inside the loop. Here we fix the 'undef case'. @@ -580,7 +585,8 @@ RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. RK_FloatMult, ///< Product of floats. - RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). + RK_FloatMinMax, ///< Min/max implemented in terms of select(cmp()). + RK_IntegerSpecial ///< Special patterns like SAD. }; /// This enum represents the kinds of inductions that we support. @@ -601,6 +607,44 @@ MRK_FloatMax }; + // This enum is used to keep track of actions to be performed on + // Instructions in special patterns. + enum SpecialActionKind { + SAK_Invalid, + SAK_Skip, ///< Skip instruction which is part of special pattern + SAK_Replace, ///< replace the instruction + SAK_Arg, ///< arguments + SAK_Phi ///< Phi instruction + }; + + // This keeps track of special pattern kind + enum SpecialPatternKind { + SPK_Invalid, + SPK_Sad + }; + + // This struct holds information about each instruction in + // Special Patterns + struct SpecialPatternDescriptor { + SpecialPatternDescriptor(SpecialPatternKind SPKind, + SpecialActionKind SAK, unsigned miVF, + unsigned maVF) + : Kind(SPKind), Action(SAK), minVF(miVF), maxVF(maVF) {} + + SpecialPatternDescriptor() : Kind(SPK_Invalid), Action(SAK_Invalid), + minVF(0), maxVF(1) {} + // Kind - to keep track of which pattern the instruction belongs + SpecialPatternKind Kind; + // Action - Specifies the action to be performed for this instruction + SpecialActionKind Action; + // minVF - Specifies the minimum VF for which the target supports + // this pattern. For example, for x86 the minimum VF for SAD is 8. + unsigned minVF; + // maxVF - Specifies the maximum VF for which the target supports + // this pattern. For example, for x86 the maximum VF for SAD is 16. + unsigned maxVF; + }; + /// This struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr), @@ -624,10 +668,15 @@ /// This POD struct holds information about a potential reduction operation. struct ReductionInstDesc { ReductionInstDesc(bool IsRedux, Instruction *I) : - IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} + IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid), + SPKind(SPK_Invalid) {} + ReductionInstDesc(Instruction *I, MinMaxReductionKind K, + SpecialPatternKind SPK) : + IsReduction(true), PatternLastInst(I), MinMaxKind(K), SPKind(SPK) {} + ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : - IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} + IsReduction(true), PatternLastInst(I), MinMaxKind(K), SPKind(SPK_Invalid) {} // Is this instruction a reduction candidate. bool IsReduction; @@ -636,6 +685,8 @@ Instruction *PatternLastInst; // If this is a min/max pattern the comparison predicate. MinMaxReductionKind MinMaxKind; + // if special pattern then hold the pattern kind. + SpecialPatternKind SPKind; }; /// A struct for saving information about induction variables. @@ -711,6 +762,10 @@ /// induction descriptor. typedef MapVector InductionList; + /// ActionList stores the actions to be performed for an instruction + /// which is part of special pattern in a basic block. + typedef DenseMap ActionList; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -725,6 +780,9 @@ /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns the actionlist for instructions in the loop. + ActionList *getActionMap() { return &SpecialActions; } + /// Returns the widest induction type. Type *getWidestInductionType() { return WidestIndTy; } @@ -816,6 +874,13 @@ /// Collect the variables that need to stay uniform after vectorization. void collectLoopUniforms(); + /// Returns true if we find a SAD pattern. + bool isSADPattern(PHINode *Phi); + + /// Returns a ReductionInstDesc with SpecialPatternKind set if it matches + /// any special pattern. + ReductionInstDesc isSpecialPattern(PHINode *Phi, ReductionInstDesc &Prev); + /// Return true if all of the instructions in the block can be speculatively /// executed. \p SafePtrs is a list of addresses that are known to be legal /// and we know that we can read from them without segfault. @@ -880,6 +945,8 @@ PHINode *Induction; /// Holds the reduction variables. ReductionList Reductions; + /// Holds the actions for instructions in Loop. + ActionList SpecialActions; /// Holds all of the induction variables that we found in the loop. /// Notice that inductions don't need to start at zero and that induction /// variables can be pointers. @@ -939,6 +1006,12 @@ /// needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. unsigned getWidestType(); + + /// \return The most profitable type for the pattern. + /// For example, for SAD pattern the return value is the smallest type + /// presnt in basic block. + /// If there is no special pattern then the widest type is returned. + unsigned selectBestTypeForPattern(); /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor @@ -972,6 +1045,10 @@ /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); + /// Returns the execution time cost of a special reduction instruction + /// for a given vector width. + unsigned SpecialPhiCost(PHINode *p, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); @@ -2476,6 +2553,7 @@ case RK_IntegerXor: case RK_IntegerAdd: case RK_IntegerOr: + case RK_IntegerSpecial: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); case RK_IntegerMult: @@ -2501,6 +2579,8 @@ switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: return Instruction::Add; + case LoopVectorizationLegality::RK_IntegerSpecial: + return Instruction::Add; case LoopVectorizationLegality::RK_IntegerMult: return Instruction::Mul; case LoopVectorizationLegality::RK_IntegerOr: @@ -2635,6 +2715,47 @@ return V; } +void InnerLoopVectorizer::generateSADInstruction(Instruction *I, + SmallVector &args, + VectorParts &Entry) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[I]; + switch (SPD.Action) { + case LoopVectorizationLegality::SAK_Arg: { + Instruction *CV = dyn_cast(I); + if (CV) { + VectorParts &Op = getVectorValue(CV->getOperand(0)); + args.push_back(Op[0]); + } + break; + } + case LoopVectorizationLegality::SAK_Skip: + // do nothing + break; + case LoopVectorizationLegality::SAK_Replace: { + Module *M = I->getParent()->getParent()->getParent(); + SmallVector Tys; + Tys.push_back(args[0]->getType()); + Tys.push_back(args[1]->getType()); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::sad, Tys); + Value *intr_call = Builder.CreateCall(F, args); + VectorParts &phi = getVectorValue(I->getOperand(1)); + Value *add = Builder.CreateBinOp(Instruction::Add, intr_call, phi[0]); + Entry[0] = add; + propagateMetadata(Entry, I); + break; + } + case LoopVectorizationLegality::SAK_Phi: { + Type *VecTy = I->getType(); + Entry[0] = PHINode::Create(VecTy, 2, "sadvec.phi", + LoopVectorBody.back()->getFirstInsertionPt()); + break; + } + case LoopVectorizationLegality::SAK_Invalid: + assert(0 && "Unknown action to be performed!!"); + break; + } +} /// Estimate the overhead of scalarizing a value. Insert and Extract are set if /// the result needs to be inserted and/or extracted from vectors. static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract, @@ -2766,6 +2887,7 @@ for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; + bool isSAD = false; assert(RdxPhi && "Unable to recover vectorized PHI"); // Find the reduction variable descriptor. @@ -2774,6 +2896,15 @@ LoopVectorizationLegality::ReductionDescriptor RdxDesc = (*Legal->getReductionVars())[RdxPhi]; + if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[RdxPhi]; + + if (SPD.minVF <= VF && SPD.maxVF >= VF && + SPD.Kind == LoopVectorizationLegality::SPK_Sad ) + isSAD = true; + } + setDebugLocFromInst(Builder, RdxDesc.StartValue); // We need to generate a reduction vector from the incoming scalar. @@ -2800,6 +2931,12 @@ RdxDesc.StartValue, "minmax.ident"); } + } else if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[RdxPhi]; + if (SPD.minVF <= VF && SPD.maxVF >= VF && + SPD.Kind == LoopVectorizationLegality::SPK_Sad) + VectorStart = Identity = RdxDesc.StartValue; } else { // Handle other reduction kinds: Constant *Iden = @@ -2874,7 +3011,7 @@ ReducedPartRdx, RdxParts[part]); } - if (VF > 1) { + if (VF > 1 && !isSAD) { // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. @@ -3155,8 +3292,28 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) { // For each instruction in the old loop. + SmallVector args; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { VectorParts &Entry = WidenMap.get(it); + if (Legal->getActionMap()->count(it)) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[it]; + if (SPD.minVF <= VF && SPD.maxVF >= VF) { + switch (SPD.Kind) { + case LoopVectorizationLegality::SPK_Sad: + generateSADInstruction(it, args, Entry); + if (SPD.Action == LoopVectorizationLegality::SAK_Phi) { + PHINode *P = cast(it); + PV->push_back(P); + } + break; + case LoopVectorizationLegality::SPK_Invalid: + assert(0 && "Cannot be Invalid here!!"); + break; + } + continue; + } + } switch (it->getOpcode()) { case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the @@ -3710,7 +3867,11 @@ continue; } - + // Check the Special Patterns. + if (AddReductionVar(Phi, RK_IntegerSpecial)) { + DEBUG(dbgs() << "LV: Found a Special reduction PHI." << *Phi <<"\n"); + continue; + } if (AddReductionVar(Phi, RK_IntegerAdd)) { DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); continue; @@ -3944,6 +4105,16 @@ return Stride; } +LoopVectorizationLegality::ReductionInstDesc +LoopVectorizationLegality::isSpecialPattern(PHINode *Phi, + ReductionInstDesc &Prev) { + if (isSADPattern(Phi)) { + return ReductionInstDesc(Phi, Prev.MinMaxKind, SPK_Sad); + } else { + return Prev; + } +} + void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) { Value *Ptr = nullptr; if (LoadInst *LI = dyn_cast(MemAccess)) @@ -4199,7 +4370,12 @@ if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction) return false; + + ReduxDesc = isSpecialPattern(Phi, ReduxDesc); + if (Kind == RK_IntegerSpecial && ReduxDesc.SPKind == SPK_Invalid) + return false; + // We found a reduction var if we have reached the original phi node and we // only have a single instruction with out-of-loop users. @@ -4216,6 +4392,90 @@ return true; } +/// \brief This function checks for the following pattern + +/// %sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ] +/// .... +/// %sub = sub nsw i32 %conv, %conv3 +/// %ispos = icmp uge i8 %0, %1 +/// %neg = sub nsw i32 0, %sub +/// %2 = select i1 %ispos, i32 %sub, i32 %neg +/// %add = add nsw i32 %2, %sum.05 + +/// This pattern is matched from bottom. if everything matches +/// then it returns true. +bool LoopVectorizationLegality::isSADPattern(PHINode *Phi) { + + LLVMContext &Context = TheLoop->getHeader()->getContext(); + // SmallVector *SadIntrList = &SPDesc.IntrList; + for (User *U : Phi->users()) { + // This is supposed to be an add instruction + if (!isa(U)) + return false; + Instruction *UI = dyn_cast(U); + if (UI->getOpcode() != Instruction::Add) + return false; + // all uses of Add instruction must be phi instructions + for (User *Ur : UI->users()) { + Instruction *I = dyn_cast(Ur); + if (!isa(I)) + return false; + } + // Checking for Selection instruction + if (!isa(UI->getOperand(0))) + return false; + + // two subs and one icmp instruction are matched + Instruction *Sel = dyn_cast(UI->getOperand(0)); + if (!isa(Sel->getOperand(1))) + return false; + Instruction *Sub1 = dyn_cast(Sel->getOperand(1)); + if (!isa(Sel->getOperand(2))) + return false; + Instruction *Sub2 = dyn_cast(Sel->getOperand(2)); + if (Sub1->getOpcode() != Instruction::Sub || + Sub2->getOpcode() != Instruction::Sub || + !isa(Sel->getOperand(0))) + return false; + Instruction *cv1 = dyn_cast(Sub1->getOperand(0)); + Instruction *cv2 = dyn_cast(Sub1->getOperand(1)); + if (!cv1 || !cv2 || (cv1->getOperand(0)->getType()->getScalarType() != + Type::getInt8Ty(Context)) || + (cv2->getOperand(0)->getType()->getScalarType() != + Type::getInt8Ty(Context))) + return false; + + if (!isa(Sel->getOperand(0))) + return false; + Instruction *icmp = dyn_cast(Sel->getOperand(0)); + if (icmp->getOperand(0) != Sub1 || Sub2->getOperand(1) != Sub1) { + if (((icmp->getOperand(0) != cv1->getOperand(0)) && + (icmp->getOperand(1) != cv1->getOperand(0))) || + ((icmp->getOperand(0) != cv2->getOperand(0)) && + (icmp->getOperand(1) != cv2->getOperand(0)))) + return false; + } + // All these instructions are added to ActionList + SpecialActions[UI] = + SpecialPatternDescriptor(SPK_Sad, SAK_Replace, 8, 16); // SAK_Replace + SpecialActions[Sel] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[icmp] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[Sub2] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[Sub1] = + SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip + SpecialActions[cv1] = + SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg + SpecialActions[cv2] = + SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg + SpecialActions[Phi] = + SpecialPatternDescriptor(SPK_Sad, SAK_Phi, 8, 16); // SAK_Phi + } + return true; +} + /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction /// pattern corresponding to a min(X, Y) or max(X, Y). LoopVectorizationLegality::ReductionInstDesc @@ -4274,6 +4534,9 @@ ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); bool FastMath = FP && I->hasUnsafeAlgebra(); + if (Kind == RK_IntegerSpecial) { + return ReductionInstDesc(true, I); + } switch (I->getOpcode()) { default: return ReductionInstDesc(false, I); @@ -4470,7 +4733,7 @@ unsigned TC = SE->getSmallConstantTripCount(TheLoop); DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - unsigned WidestType = getWidestType(); + unsigned BestType = selectBestTypeForPattern(); unsigned WidestRegister = TTI.getRegisterBitWidth(true); unsigned MaxSafeDepDist = -1U; if (Legal->getMaxSafeDepDistBytes() != -1U) @@ -4477,8 +4740,8 @@ MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8; WidestRegister = ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist); - unsigned MaxVectorSize = WidestRegister / WidestType; - DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n"); + unsigned MaxVectorSize = WidestRegister / BestType; + DEBUG(dbgs() << "LV: The Best type: " << BestType << " bits.\n"); DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister << " bits.\n"); @@ -4567,6 +4830,61 @@ return Factor; } +unsigned LoopVectorizationCostModel::selectBestTypeForPattern() { + unsigned MaxWidth = 8; + const DataLayout &DL = TheFunction->getParent()->getDataLayout(); + unsigned specialPatternWidth = 1024; + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + BasicBlock *BB = *bb; + + // For each instruction in the loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Type *T = it->getType(); + + // Ignore ephemeral values. + if (EphValues.count(it)) + continue; + + // Only examine Loads, Stores and PHINodes. + if (!isa(it) && !isa(it) && !isa(it)) + continue; + + // Examine PHI nodes that are reduction variables. + if (PHINode *PN = dyn_cast(it)) { + if (!Legal->getReductionVars()->count(PN)) + continue; + if(Legal->getActionMap()->count(PN)) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[it]; + switch(SPD.Kind) { + case LoopVectorizationLegality::SPK_Sad: + specialPatternWidth = std::min(specialPatternWidth,(unsigned)8); + continue; + default : break; + } + } + } + + // Examine the stored values. + if (StoreInst *ST = dyn_cast(it)) + T = ST->getValueOperand()->getType(); + + // Ignore loaded pointer types and stored pointer types that are not + // consecutive. However, we do want to take consecutive stores/loads of + // pointer vectors into account. + if (T->isPointerTy() && !isConsecutiveLoadOrStore(it)) + continue; + + MaxWidth = std::max(MaxWidth, + (unsigned)DL.getTypeSizeInBits(T->getScalarType())); + } + } + + return std::min(MaxWidth,specialPatternWidth); +} + unsigned LoopVectorizationCostModel::getWidestType() { unsigned MaxWidth = 8; const DataLayout &DL = TheFunction->getParent()->getDataLayout(); @@ -4903,7 +5221,23 @@ // Ignore ephemeral values. if (EphValues.count(it)) continue; - + + if (Legal->getActionMap()->count(it)) { + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[it]; + if (VF >= SPD.minVF && VF <= SPD.maxVF) { + if (SPD.Action == LoopVectorizationLegality::SAK_Phi) { + PHINode *p = dyn_cast(it); + unsigned PhiCost; + PhiCost= SpecialPhiCost(p, VF); + DEBUG(dbgs() << "LV: Found an estimated cost of " << PhiCost + << " for VF " << VF << + " For special Phi instruction: " << *it << '\n'); + BlockCost += PhiCost; + } + continue; + } + } unsigned C = getInstructionCost(it, VF); // Check if we should override the cost. @@ -4987,6 +5321,36 @@ } unsigned +LoopVectorizationCostModel::SpecialPhiCost(PHINode *P, unsigned VF) { + + if (Legal->getReductionVars()->count(P)) { + LoopVectorizationLegality::ReductionDescriptor RdxDesc = + (*Legal->getReductionVars())[P]; + assert(Legal->getActionMap()->count(P) && "something wrong"); + LoopVectorizationLegality::SpecialPatternDescriptor SPD = + (*Legal->getActionMap())[P]; + if (VF >= SPD.minVF && VF <= SPD.maxVF) { + switch (SPD.Kind) { + case LoopVectorizationLegality::SPK_Sad: { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + SmallVector Tys; + Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF)); + Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF)); + Type *RetTy = Type::getInt32Ty(Context); + Intrinsic::ID ID = Intrinsic::sad; + unsigned IntrinsicCost = TTI.getIntrinsicInstrCost(ID, RetTy, Tys); + return IntrinsicCost; + } break; + + default: + break; + } + } + } + return 0; +} + +unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. Index: test/Transforms/LoopVectorize/X86/sad-pattern.ll =================================================================== --- test/Transforms/LoopVectorize/X86/sad-pattern.ll +++ test/Transforms/LoopVectorize/X86/sad-pattern.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +; ModuleID = '' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind readonly uwtable +; CHECK-LABEL: sad +; CHECK: call i32 @llvm.sad.v16i8.v16i8 +define i32 @sad(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.07 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %pix1, i64 %indvars.iv + %0 = load i8, i8* %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx2 = getelementptr inbounds i8, i8* %pix2, i64 %indvars.iv + %1 = load i8, i8* %arrayidx2, align 1 + %conv3 = zext i8 %1 to i32 + %sub = sub nsw i32 %conv, %conv3 + %cmp4 = icmp sgt i32 %sub, 0 + %sub6 = sub nsw i32 0, %sub + %cond = select i1 %cmp4, i32 %sub, i32 %sub6 + %add = add nsw i32 %cond, %sum.07 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add +}