Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -452,6 +452,10 @@
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys) const;
 
+  /// \returns the cost of SAD instruction.
+  unsigned getSADInstrCost(Type *RetTy, Type *op1,
+                           Type *op2) const;
+
   /// \returns The cost of Call instructions.
   unsigned getCallInstrCost(Function *F, Type *RetTy,
                             ArrayRef<Type *> Tys) const;
@@ -573,6 +577,8 @@
                                     bool IsPairwiseForm) = 0;
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                          ArrayRef<Type *> Tys) = 0;
+  virtual unsigned getSADInstrCost(Type *RetTy, Type *op1,
+                                   Type *op2) = 0;
   virtual unsigned getCallInstrCost(Function *F, Type *RetTy,
                                     ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
@@ -732,6 +738,10 @@
                                  ArrayRef<Type *> Tys) override {
     return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
   }
+  unsigned getSADInstrCost(Type *RetTy, Type *op1,
+                           Type *op2) override {
+    return Impl.getSADInstrCost(RetTy, op1, op2);
+  }
   unsigned getCallInstrCost(Function *F, Type *RetTy,
                             ArrayRef<Type *> Tys) override {
     return Impl.getCallInstrCost(F, RetTy, Tys);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -302,6 +302,10 @@
                                  ArrayRef<Type *> Tys) {
     return 1;
   }
+  
+  unsigned getSADInstrCost(Type *RetTy, Type *op1, Type *op2) {
+    return 1;
+  }
 
   unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
     return 1;
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -627,6 +627,9 @@
     case Intrinsic::masked_load:
       return static_cast<T *>(this)
           ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+    case Intrinsic::sad:
+      return static_cast<T *>(this)
+          ->getSADInstrCost(RetTy, Tys[0], Tys[1]);
     }
 
     const TargetLoweringBase *TLI = getTLI();
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -642,6 +642,11 @@
     /// read / write specifier, locality specifier and instruction / data cache
     /// specifier.
     PREFETCH,
+  
+    /// SAD - This corresponds to a Sum of Absolute Difference(SAD) instruction.
+    /// The operands are vectors of char type with length of vector being power
+    /// of 2 and the result is a scalar integer.
+    SAD,
 
     /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
     /// This corresponds to the fence instruction. It takes an input chain, and
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -592,6 +592,12 @@
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
 
+// Calculate the Sum of Absolute Differences (SAD) of the two input vectors.
+// Only vectors of char Type are allowed.
+def int_sad  : Intrinsic<[llvm_i32_ty],
+                         [llvm_anyvector_ty, llvm_anyvector_ty],
+                         [IntrNoMem]>;
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, LLVMPointerTo<0>,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -194,6 +194,11 @@
                                          Opd1PropInfo, Opd2PropInfo);
 }
 
+unsigned TargetTransformInfo::getSADInstrCost(Type *RetTy, Type *op1,
+                                              Type *op2) const {
+  return TTIImpl->getSADInstrCost(RetTy, op1, op2);
+}
+
 unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty,
                                              int Index, Type *SubTp) const {
   return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp);
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -73,6 +73,8 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor();
+  unsigned getSADInstrCost(Type *RetTy, Type *op1, 
+                           Type *op2);
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -77,7 +77,34 @@
 
   return 2;
 }
+unsigned X86TTIImpl::getSADInstrCost(Type *RetTy, Type *op1,
+                                     Type *op2) {
 
+  unsigned result = 0;
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(op1);
+  EVT arg1Ty = TLI->getValueType(op1);
+  EVT arg2Ty = TLI->getValueType(op2);
+  assert((arg1Ty == arg2Ty) &&
+         "cannot handle different type of arguments for SAD");
+
+  MVT MTy = arg1Ty.getSimpleVT();
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE1CostTable[] = {
+      {ISD::SAD, MVT::v8i8, 4},
+      {ISD::SAD, MVT::v16i8, 5},
+  };
+
+  if (ST->hasSSE1() || ST->hasAVX()) {
+    int Idx = CostTableLookup(SSE1CostTable, ISD::SAD, MTy);
+  if (Idx != -1)
+    return result = LT.first * SSE1CostTable[Idx].Cost;
+  }
+  if (ST->is64Bit() || ST->hasMMX()) {
+    return 4;
+  }
+  return 0;
+}
+
 unsigned X86TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -296,6 +296,11 @@
   /// Copy and widen the instructions from the old loop.
   virtual void vectorizeLoop();
 
+  /// While widening the instructions, skip the SAD pattern and replace
+  /// with SAD intrinsic call.
+  void generateSADInstruction(Instruction *I, SmallVector<Value *, 4> &args,
+                              VectorParts &Entry);
+
   /// \brief The Loop exit block may have single value PHI nodes where the
   /// incoming value is 'Undef'. While vectorizing we only handled real values
   /// that were defined inside the loop. Here we fix the 'undef case'.
@@ -580,7 +585,8 @@
     RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
     RK_FloatMult,   ///< Product of floats.
-    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
+    RK_FloatMinMax,  ///< Min/max implemented in terms of select(cmp()).
+    RK_IntegerSpecial ///< Special patterns like SAD.
   };
 
   /// This enum represents the kinds of inductions that we support.
@@ -601,6 +607,44 @@
     MRK_FloatMax
   };
 
+  // This enum is used to keep track of actions to be performed on
+  // Instructions in special patterns.
+  enum SpecialActionKind {
+    SAK_Invalid,
+    SAK_Skip, ///< Skip instruction which is part of special pattern
+    SAK_Replace, ///< replace the instruction
+    SAK_Arg, ///< arguments
+    SAK_Phi ///< Phi instruction
+  };
+
+  // This keeps track of special pattern kind
+  enum SpecialPatternKind {
+    SPK_Invalid,
+    SPK_Sad
+  };
+
+  // This struct holds information about each instruction in
+  // Special Patterns
+  struct SpecialPatternDescriptor {
+    SpecialPatternDescriptor(SpecialPatternKind SPKind,
+                             SpecialActionKind SAK, unsigned miVF,
+                             unsigned maVF)
+      : Kind(SPKind), Action(SAK), minVF(miVF), maxVF(maVF) {}
+
+    SpecialPatternDescriptor() : Kind(SPK_Invalid), Action(SAK_Invalid),
+                                 minVF(0), maxVF(1) {}
+    // Kind - to keep track of which pattern the instruction belongs
+    SpecialPatternKind Kind;
+    // Action - Specifies the action to be performed for this instruction
+    SpecialActionKind Action;
+    // minVF - Specifies the minimum VF for which the target supports
+    // this pattern. For example, for x86 the minimum VF for SAD is 8.
+    unsigned minVF;
+    // maxVF - Specifies the maximum VF for which the target supports
+    // this pattern. For example, for x86 the maximum VF for SAD is 16.
+    unsigned maxVF;
+  };
+  
   /// This struct holds information about reduction variables.
   struct ReductionDescriptor {
     ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
@@ -624,10 +668,15 @@
   /// This POD struct holds information about a potential reduction operation.
   struct ReductionInstDesc {
     ReductionInstDesc(bool IsRedux, Instruction *I) :
-      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
+      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid),
+      SPKind(SPK_Invalid) {}
 
+    ReductionInstDesc(Instruction *I, MinMaxReductionKind K,
+                      SpecialPatternKind SPK) :
+      IsReduction(true), PatternLastInst(I), MinMaxKind(K), SPKind(SPK) {}
+
     ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
-      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
+      IsReduction(true), PatternLastInst(I), MinMaxKind(K), SPKind(SPK_Invalid) {}
 
     // Is this instruction a reduction candidate.
     bool IsReduction;
@@ -636,6 +685,8 @@
     Instruction *PatternLastInst;
     // If this is a min/max pattern the comparison predicate.
     MinMaxReductionKind MinMaxKind;
+    // if special pattern then hold the pattern kind.
+    SpecialPatternKind SPKind;
   };
 
   /// A struct for saving information about induction variables.
@@ -711,6 +762,10 @@
   /// induction descriptor.
   typedef MapVector<PHINode*, InductionInfo> InductionList;
 
+  /// ActionList stores the actions to be performed for an instruction
+  /// which is part of special pattern in a basic block.
+  typedef DenseMap<Instruction *, SpecialPatternDescriptor> ActionList;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -725,6 +780,9 @@
   /// Returns the induction variables found in the loop.
   InductionList *getInductionVars() { return &Inductions; }
 
+  /// Returns the actionlist for instructions in the loop.
+  ActionList *getActionMap() { return &SpecialActions; }
+
   /// Returns the widest induction type.
   Type *getWidestInductionType() { return WidestIndTy; }
 
@@ -816,6 +874,13 @@
   /// Collect the variables that need to stay uniform after vectorization.
   void collectLoopUniforms();
 
+  /// Returns true if we find a SAD pattern.
+  bool isSADPattern(PHINode *Phi);
+
+  /// Returns a ReductionInstDesc with SpecialPatternKind set if it matches
+  /// any special pattern.
+  ReductionInstDesc isSpecialPattern(PHINode *Phi, ReductionInstDesc &Prev);
+
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
@@ -880,6 +945,8 @@
   PHINode *Induction;
   /// Holds the reduction variables.
   ReductionList Reductions;
+  /// Holds the actions for instructions in Loop.
+  ActionList SpecialActions;
   /// Holds all of the induction variables that we found in the loop.
   /// Notice that inductions don't need to start at zero and that induction
   /// variables can be pointers.
@@ -939,6 +1006,12 @@
   /// needs to be vectorized. We ignore values that remain scalar such as
   /// 64 bit loop indices.
   unsigned getWidestType();
+  
+  /// \return The most profitable type for the pattern.
+  /// For example, for SAD pattern the return value is the smallest type 
+  /// presnt in basic block.
+  /// If there is no special pattern then the widest type is returned. 
+  unsigned selectBestTypeForPattern();
 
   /// \return The most profitable unroll factor.
   /// If UserUF is non-zero then this method finds the best unroll-factor
@@ -972,6 +1045,10 @@
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
 
+  /// Returns the execution time cost of a special reduction instruction
+  /// for a given vector width.
+  unsigned SpecialPhiCost(PHINode *p, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -2476,6 +2553,7 @@
   case RK_IntegerXor:
   case RK_IntegerAdd:
   case RK_IntegerOr:
+  case RK_IntegerSpecial:
     // Adding, Xoring, Oring zero to a number does not change it.
     return ConstantInt::get(Tp, 0);
   case RK_IntegerMult:
@@ -2501,6 +2579,8 @@
   switch (Kind) {
     case LoopVectorizationLegality::RK_IntegerAdd:
       return Instruction::Add;
+    case LoopVectorizationLegality::RK_IntegerSpecial:
+      return Instruction::Add;
     case LoopVectorizationLegality::RK_IntegerMult:
       return Instruction::Mul;
     case LoopVectorizationLegality::RK_IntegerOr:
@@ -2635,6 +2715,47 @@
   return V;
 }
 
+void InnerLoopVectorizer::generateSADInstruction(Instruction *I,
+                                                 SmallVector<Value *, 4> &args,
+                                                 VectorParts &Entry) {
+  LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+                           (*Legal->getActionMap())[I];
+  switch (SPD.Action) {
+    case LoopVectorizationLegality::SAK_Arg: {
+      Instruction *CV = dyn_cast<CastInst>(I);
+      if (CV) {
+        VectorParts &Op = getVectorValue(CV->getOperand(0));
+        args.push_back(Op[0]);
+      }
+      break;
+    }
+    case LoopVectorizationLegality::SAK_Skip:
+      // do nothing
+      break;
+    case LoopVectorizationLegality::SAK_Replace: {
+      Module *M = I->getParent()->getParent()->getParent();
+      SmallVector<Type *, 4> Tys;
+      Tys.push_back(args[0]->getType());
+      Tys.push_back(args[1]->getType());
+      Function *F = Intrinsic::getDeclaration(M, Intrinsic::sad, Tys);
+      Value *intr_call = Builder.CreateCall(F, args);
+      VectorParts &phi = getVectorValue(I->getOperand(1));
+      Value *add = Builder.CreateBinOp(Instruction::Add, intr_call, phi[0]);
+      Entry[0] = add;
+      propagateMetadata(Entry, I);
+      break;
+    }
+    case LoopVectorizationLegality::SAK_Phi: {
+      Type *VecTy = I->getType();
+      Entry[0] = PHINode::Create(VecTy, 2, "sadvec.phi",
+      LoopVectorBody.back()->getFirstInsertionPt());
+      break;
+    }
+    case LoopVectorizationLegality::SAK_Invalid:
+      assert(0 && "Unknown action to be performed!!");
+      break;
+  }
+}
 /// Estimate the overhead of scalarizing a value. Insert and Extract are set if
 /// the result needs to be inserted and/or extracted from vectors.
 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
@@ -2766,6 +2887,7 @@
   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
        it != e; ++it) {
     PHINode *RdxPhi = *it;
+    bool isSAD = false;
     assert(RdxPhi && "Unable to recover vectorized PHI");
 
     // Find the reduction variable descriptor.
@@ -2774,6 +2896,15 @@
     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
     (*Legal->getReductionVars())[RdxPhi];
 
+    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) {
+      LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+         (*Legal->getActionMap())[RdxPhi];
+
+      if (SPD.minVF <= VF && SPD.maxVF >= VF &&
+          SPD.Kind == LoopVectorizationLegality::SPK_Sad )
+        isSAD = true;
+    }
+
     setDebugLocFromInst(Builder, RdxDesc.StartValue);
 
     // We need to generate a reduction vector from the incoming scalar.
@@ -2800,6 +2931,12 @@
                                                            RdxDesc.StartValue,
                                                            "minmax.ident");
       }
+    } else if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) {
+       LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+         (*Legal->getActionMap())[RdxPhi];
+         if (SPD.minVF <= VF && SPD.maxVF >= VF && 
+             SPD.Kind == LoopVectorizationLegality::SPK_Sad) 
+           VectorStart = Identity = RdxDesc.StartValue;
     } else {
       // Handle other reduction kinds:
       Constant *Iden =
@@ -2874,7 +3011,7 @@
                                         ReducedPartRdx, RdxParts[part]);
     }
 
-    if (VF > 1) {
+    if (VF > 1 && !isSAD) {
       // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
       // and vector ops, reducing the set of values being computed by half each
       // round.
@@ -3155,8 +3292,28 @@
 
 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
   // For each instruction in the old loop.
+  SmallVector<Value *, 4> args;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     VectorParts &Entry = WidenMap.get(it);
+    if (Legal->getActionMap()->count(it)) {
+      LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+          (*Legal->getActionMap())[it];
+      if (SPD.minVF <= VF && SPD.maxVF >= VF) {
+      switch (SPD.Kind) {
+        case LoopVectorizationLegality::SPK_Sad:
+          generateSADInstruction(it, args, Entry);
+          if (SPD.Action == LoopVectorizationLegality::SAK_Phi) {
+            PHINode *P = cast<PHINode>(it);
+            PV->push_back(P);
+          }
+          break;
+        case LoopVectorizationLegality::SPK_Invalid:
+          assert(0 && "Cannot be Invalid here!!");
+          break;
+        }
+        continue;
+      }
+    }
     switch (it->getOpcode()) {
     case Instruction::Br:
       // Nothing to do for PHIs and BR, since we already took care of the
@@ -3710,7 +3867,11 @@
 
           continue;
         }
-
+        // Check the Special Patterns.
+        if (AddReductionVar(Phi, RK_IntegerSpecial)) {
+          DEBUG(dbgs() << "LV: Found a Special reduction PHI." << *Phi <<"\n");
+          continue;
+        }
         if (AddReductionVar(Phi, RK_IntegerAdd)) {
           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
           continue;
@@ -3944,6 +4105,16 @@
   return Stride;
 }
 
+LoopVectorizationLegality::ReductionInstDesc
+LoopVectorizationLegality::isSpecialPattern(PHINode *Phi, 
+                                            ReductionInstDesc &Prev) {
+  if (isSADPattern(Phi)) {
+    return ReductionInstDesc(Phi, Prev.MinMaxKind, SPK_Sad);
+  } else {
+    return Prev;
+  }
+}
+
 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
   Value *Ptr = nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
@@ -4199,7 +4370,12 @@
 
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
+  
+  ReduxDesc = isSpecialPattern(Phi, ReduxDesc);
 
+  if (Kind == RK_IntegerSpecial && ReduxDesc.SPKind == SPK_Invalid)
+    return false;
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.
 
@@ -4216,6 +4392,90 @@
   return true;
 }
 
+/// \brief This function checks for the following pattern
+
+/// %sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+/// ....
+/// %sub = sub nsw i32 %conv, %conv3
+/// %ispos = icmp uge i8 %0, %1
+/// %neg = sub nsw i32 0, %sub
+/// %2 = select i1 %ispos, i32 %sub, i32 %neg
+/// %add = add nsw i32 %2, %sum.05
+
+/// This pattern is matched from bottom. if everything matches
+/// then it returns true.
+bool LoopVectorizationLegality::isSADPattern(PHINode *Phi) {
+
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  // SmallVector<Instruction*, 2> *SadIntrList = &SPDesc.IntrList;
+  for (User *U : Phi->users()) {
+  // This is supposed to be an add instruction
+    if (!isa<Instruction>(U))
+      return false;
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (UI->getOpcode() != Instruction::Add)
+      return false;
+  // all uses of Add instruction must be phi instructions
+  for (User *Ur : UI->users()) {
+    Instruction *I = dyn_cast<Instruction>(Ur);
+    if (!isa<PHINode>(I))
+      return false;
+  }
+  // Checking for Selection instruction
+  if (!isa<SelectInst>(UI->getOperand(0)))
+    return false;
+
+  // two subs and one icmp instruction are matched
+  Instruction *Sel = dyn_cast<Instruction>(UI->getOperand(0));
+  if (!isa<Instruction>(Sel->getOperand(1)))
+    return false;
+  Instruction *Sub1 = dyn_cast<Instruction>(Sel->getOperand(1));
+  if (!isa<Instruction>(Sel->getOperand(2)))
+    return false;
+  Instruction *Sub2 = dyn_cast<Instruction>(Sel->getOperand(2));
+  if (Sub1->getOpcode() != Instruction::Sub ||
+      Sub2->getOpcode() != Instruction::Sub ||
+      !isa<ICmpInst>(Sel->getOperand(0)))
+    return false;
+  Instruction *cv1 = dyn_cast<CastInst>(Sub1->getOperand(0));
+  Instruction *cv2 = dyn_cast<CastInst>(Sub1->getOperand(1));
+  if (!cv1 || !cv2 || (cv1->getOperand(0)->getType()->getScalarType() !=
+                       Type::getInt8Ty(Context)) ||
+      (cv2->getOperand(0)->getType()->getScalarType() !=
+       Type::getInt8Ty(Context)))
+    return false;
+
+  if (!isa<Instruction>(Sel->getOperand(0)))
+    return false;
+  Instruction *icmp = dyn_cast<Instruction>(Sel->getOperand(0));
+  if (icmp->getOperand(0) != Sub1 || Sub2->getOperand(1) != Sub1) {
+    if (((icmp->getOperand(0) != cv1->getOperand(0)) &&
+         (icmp->getOperand(1) != cv1->getOperand(0))) ||
+        ((icmp->getOperand(0) != cv2->getOperand(0)) &&
+         (icmp->getOperand(1) != cv2->getOperand(0))))
+      return false;
+  }
+    // All these instructions are added to ActionList
+  SpecialActions[UI] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Replace, 8, 16); // SAK_Replace
+  SpecialActions[Sel] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+  SpecialActions[icmp] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+  SpecialActions[Sub2] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+  SpecialActions[Sub1] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+  SpecialActions[cv1] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg
+  SpecialActions[cv2] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg
+  SpecialActions[Phi] =
+      SpecialPatternDescriptor(SPK_Sad, SAK_Phi, 8, 16); // SAK_Phi
+  }
+  return true;
+}
+
 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
 /// pattern corresponding to a min(X, Y) or max(X, Y).
 LoopVectorizationLegality::ReductionInstDesc
@@ -4274,6 +4534,9 @@
                                             ReductionInstDesc &Prev) {
   bool FP = I->getType()->isFloatingPointTy();
   bool FastMath = FP && I->hasUnsafeAlgebra();
+  if (Kind == RK_IntegerSpecial) {
+      return ReductionInstDesc(true, I);
+  }
   switch (I->getOpcode()) {
   default:
     return ReductionInstDesc(false, I);
@@ -4470,7 +4733,7 @@
   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
-  unsigned WidestType = getWidestType();
+  unsigned BestType = selectBestTypeForPattern();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
   unsigned MaxSafeDepDist = -1U;
   if (Legal->getMaxSafeDepDistBytes() != -1U)
@@ -4477,8 +4740,8 @@
     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
   WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
                     WidestRegister : MaxSafeDepDist);
-  unsigned MaxVectorSize = WidestRegister / WidestType;
-  DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
+  unsigned MaxVectorSize = WidestRegister / BestType;
+  DEBUG(dbgs() << "LV: The Best type: " << BestType << " bits.\n");
   DEBUG(dbgs() << "LV: The Widest register is: "
           << WidestRegister << " bits.\n");
 
@@ -4567,6 +4830,61 @@
   return Factor;
 }
 
+unsigned LoopVectorizationCostModel::selectBestTypeForPattern() {
+  unsigned MaxWidth = 8;
+  const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+  unsigned specialPatternWidth = 1024;
+   // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+    BasicBlock *BB = *bb;
+
+    // For each instruction in the loop.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      Type *T = it->getType();
+
+      // Ignore ephemeral values.
+      if (EphValues.count(it))
+        continue;
+
+      // Only examine Loads, Stores and PHINodes.
+      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+        continue;
+
+      // Examine PHI nodes that are reduction variables.
+      if (PHINode *PN = dyn_cast<PHINode>(it)) {
+        if (!Legal->getReductionVars()->count(PN))
+          continue;
+        if(Legal->getActionMap()->count(PN)) {
+          LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+            (*Legal->getActionMap())[it];
+          switch(SPD.Kind) {
+            case LoopVectorizationLegality::SPK_Sad:
+              specialPatternWidth = std::min(specialPatternWidth,(unsigned)8);
+              continue;
+            default : break;
+          }
+        }
+      }
+
+      // Examine the stored values.
+      if (StoreInst *ST = dyn_cast<StoreInst>(it))
+        T = ST->getValueOperand()->getType();
+
+      // Ignore loaded pointer types and stored pointer types that are not
+      // consecutive. However, we do want to take consecutive stores/loads of
+      // pointer vectors into account.
+      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+        continue;
+
+      MaxWidth = std::max(MaxWidth,
+                          (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+    }
+  }
+
+  return std::min(MaxWidth,specialPatternWidth);
+}
+
 unsigned LoopVectorizationCostModel::getWidestType() {
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
@@ -4903,7 +5221,23 @@
       // Ignore ephemeral values.
       if (EphValues.count(it))
         continue;
-
+   
+      if (Legal->getActionMap()->count(it)) {
+        LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+            (*Legal->getActionMap())[it];
+        if (VF >= SPD.minVF && VF <= SPD.maxVF) {
+          if (SPD.Action == LoopVectorizationLegality::SAK_Phi) {
+             PHINode *p = dyn_cast<PHINode>(it);
+             unsigned PhiCost;
+             PhiCost= SpecialPhiCost(p, VF);
+             DEBUG(dbgs() << "LV: Found an estimated cost of " << PhiCost 
+                   << " for VF " << VF << 
+                   " For special Phi instruction: " << *it << '\n');
+             BlockCost += PhiCost;
+          }
+        continue;
+        }
+      }
       unsigned C = getInstructionCost(it, VF);
 
       // Check if we should override the cost.
@@ -4987,6 +5321,36 @@
 }
 
 unsigned
+LoopVectorizationCostModel::SpecialPhiCost(PHINode *P, unsigned VF) {
+
+  if (Legal->getReductionVars()->count(P)) {
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+        (*Legal->getReductionVars())[P];
+    assert(Legal->getActionMap()->count(P) && "something wrong");
+    LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+        (*Legal->getActionMap())[P];
+    if (VF >= SPD.minVF && VF <= SPD.maxVF) {
+      switch (SPD.Kind) {
+      case LoopVectorizationLegality::SPK_Sad: {
+        LLVMContext &Context = TheLoop->getHeader()->getContext();
+        SmallVector<Type *, 4> Tys;
+        Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF));
+        Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF));
+        Type *RetTy = Type::getInt32Ty(Context);
+        Intrinsic::ID ID = Intrinsic::sad;
+        unsigned IntrinsicCost = TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+        return IntrinsicCost;
+      } break;
+
+      default:
+        break;
+      }
+    }
+  }
+  return 0;
+}
+
+unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
Index: test/Transforms/LoopVectorize/X86/sad-pattern.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/sad-pattern.ll
+++ test/Transforms/LoopVectorize/X86/sad-pattern.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; ModuleID = '<stdin>'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readonly uwtable
+; CHECK-LABEL: sad
+; CHECK: call i32 @llvm.sad.v16i8.v16i8 
+define i32 @sad(i8* nocapture readonly %pix1, i8* nocapture readonly %pix2) #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.07 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %pix1, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds i8, i8* %pix2, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %cmp4 = icmp sgt i32 %sub, 0
+  %sub6 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp4, i32 %sub, i32 %sub6
+  %add = add nsw i32 %cond, %sum.07
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 %add
+}