Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -449,6 +449,10 @@
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                  ArrayRef<Type *> Tys) const;
 
+  /// \returns the cost of SAD instruction.
+  unsigned getSADInstrCost(unsigned ISD, Type *RetTy, Type *op1,
+                        Type *op2) const;
+
   /// \returns The number of pieces into which the provided type must be
   /// split during legalization. Zero is returned when the answer is unknown.
   unsigned getNumberOfParts(Type *Tp) const;
@@ -565,6 +569,8 @@
                                     bool IsPairwiseForm) = 0;
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
                                          ArrayRef<Type *> Tys) = 0;
+  virtual unsigned getSADInstrCost(unsigned ISD, Type *RetTy, Type *op1,
+                                Type *op2) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
   virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
@@ -719,6 +725,10 @@
                                  ArrayRef<Type *> Tys) override {
     return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
   }
+  unsigned getSADInstrCost(unsigned ISD, Type *RetTy, Type *op1,
+                        Type *op2) override {
+    return Impl.getSADInstrCost(ISD, RetTy, op1, op2);
+  }
   unsigned getNumberOfParts(Type *Tp) override {
     return Impl.getNumberOfParts(Tp);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -301,6 +301,11 @@
     return 1;
   }
 
+  unsigned getSADInstrCost(unsigned ISD, Type *RetTy, 
+                           Type *op1, Type *op2) {
+    return 1;
+  }
+
   unsigned getNumberOfParts(Type *Tp) { return 0; }
 
   unsigned getAddressComputationCost(Type *Tp, bool) { return 0; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -615,6 +615,10 @@
     case Intrinsic::masked_load:
       return static_cast<T *>(this)
           ->getMaskedMemoryOpCost(Instruction::Load, RetTy, 0, 0);
+    case Intrinsic::sad:
+      ISD = ISD::SAD;
+      return static_cast<T *>(this)
+          ->getSADInstrCost(ISD, RetTy, Tys[0], Tys[1]);
     }
 
     const TargetLoweringBase *TLI = getTLI();
Index: include/llvm/CodeGen/ISDOpcodes.h
===================================================================
--- include/llvm/CodeGen/ISDOpcodes.h
+++ include/llvm/CodeGen/ISDOpcodes.h
@@ -643,6 +643,10 @@
     /// specifier.
     PREFETCH,
 
+    /// SAD - This corresponds to a SAD instruction. The operands are vectors
+    /// and the result is a scalar type.
+    SAD,
+
     /// OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope)
     /// This corresponds to the fence instruction. It takes an input chain, and
     /// two integer constants: an AtomicOrdering and a SynchronizationScope.
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -580,6 +580,11 @@
 // maps to void __clear_cache() on supporting platforms
 def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty],
                                 [], "llvm.clear_cache">;
+// Instrinsic takes any vector as argument and generate the 
+// Sum of Absolute difference(SAD) Instrunction according to target.
+def int_sad  : Intrinsic<[llvm_i32_ty], 
+                         [llvm_anyvector_ty, llvm_anyvector_ty], 
+                         [IntrNoMem]>;
 
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -190,6 +190,11 @@
                                          Opd1PropInfo, Opd2PropInfo);
 }
 
+unsigned TargetTransformInfo::getSADInstrCost(unsigned ISD, Type *RetTy,
+                                              Type *op1, Type *op2) const {
+  return TTIImpl->getSADInstrCost(ISD, RetTy, op1, op2);
+}
+
 unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty,
                                              int Index, Type *SubTp) const {
   return TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp);
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1287,7 +1287,12 @@
       return;
     }
     break;
-
+  case ISD::SAD:
+    // Sad operation with illegal type will be custom-lowered.
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getOperand(0).getValueType());
+    SimpleFinishLegalizing = false;
+    break;
   default:
     if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
       Action = TargetLowering::Legal;
@@ -1382,7 +1387,7 @@
     dbgs() << "\n";
 #endif
     llvm_unreachable("Do not know how to legalize this operator!");
-
+  case ISD::SAD:
   case ISD::CALLSEQ_START:
   case ISD::CALLSEQ_END:
     break;
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4415,6 +4415,32 @@
   return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);
 }
 
+/// ExpandSad - expand the llvm sad intrinsic.
+static SDValue ExpandSad(SDLoc DL, SDValue LHS, SDValue RHS,
+                         SelectionDAG &DAG) {
+    SDValue result;
+    // if the type of SAD is legal i.e. v16i8, then the output of SAD 
+    // instruction will be v2i64. We need to extract  result from
+    // upper and lower half and add manually.
+    if(LHS.getValueType() == MVT::v16i8 && RHS.getValueType() == MVT::v16i8)
+    {
+      SDValue sad = DAG.getNode(ISD::SAD, DL, MVT::v2i64, LHS, RHS);
+      EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+      SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+                                       sad, DAG.getConstant(0, VecIdxTy));
+      SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, sad,
+                                    DAG.getConstant(1, VecIdxTy));
+ 
+      BottomHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BottomHalf);
+      TopHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, TopHalf);
+      result = DAG.getNode(ISD::ADD, DL, MVT::i32, TopHalf, BottomHalf);
+    }
+    else
+      // If the type is not legal then just generare SAD node which will
+      // be handled in custom lowering.
+      result = DAG.getNode(ISD::SAD, DL, MVT::i32, LHS, RHS);      
+  return result;
+}
 
 /// ExpandPowI - Expand a llvm.powi intrinsic.
 static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS,
@@ -5147,6 +5173,11 @@
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0))));
     return nullptr;
+  case Intrinsic::sad: {
+    setValue(&I, ExpandSad(sdl, getValue(I.getArgOperand(0)),
+                           getValue(I.getArgOperand(1)), DAG));
+    return nullptr;
+  }
   case Intrinsic::cttz: {
     SDValue Arg = getValue(I.getArgOperand(0));
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2084,6 +2084,16 @@
 
   switch (Opcode) {
   default: break;
+  case ISD::SAD: {
+    SDNode* New;
+    SDValue Ops[] = {Node->getOperand(0), Node->getOperand(1)};
+    if(Subtarget->hasAVX() || Subtarget->hasSSE1())
+      New = CurDAG->getMachineNode(X86::PSADBWrr, dl, Node->getValueType(0), Ops);
+    else if(Subtarget->hasMMX())
+      New = CurDAG->getMachineNode(X86::MMX_PSADBWirr, dl,
+                                   Node->getValueType(0), Ops); 
+    return New;
+  }
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
     switch (IntNo) {
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1041,7 +1041,9 @@
     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
-
+    
+    setOperationAction(ISD::SAD,                MVT::v8i8, Custom);
+   
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 
@@ -16768,6 +16770,45 @@
   return SDValue();
 }
 
+static SDValue LowerSAD(SDValue Op, const X86Subtarget *Subtarget,
+                        SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+  SDValue N0 = Node->getOperand(0);
+  SDValue N1 = Node->getOperand(1);
+  SDValue result;
+  if (Op.getSimpleValueType() != MVT::i32)
+    return SDValue();
+
+  if (Subtarget->hasSSE1() || Subtarget->hasAVX()) {
+    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy();
+    SDValue V0 = DAG.getUNDEF(MVT::v8i8);
+    SDValue V1 = DAG.getUNDEF(MVT::v8i8);
+    SDValue Op0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, 
+                              V0, N0);
+    SDValue Op1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, 
+                              V1, N1);
+    SDValue Ops[] = {Op0, Op1};
+    SDValue SAD = DAG.getNode(ISD::SAD, dl, MVT::v2i64, Ops);
+    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, 
+                                      dl, MVT::i64, SAD, 
+                                      DAG.getConstant(0, VecIdxTy));
+    result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
+
+  }else if (Subtarget->hasMMX()) {
+    /// Convert 1st arg type of SUB, i8 to X86_MMX
+    SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::x86mmx, N0);
+    /// Convert 2nd arg type of SUB, i8 to X86_MMX
+    SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::x86mmx, N1);
+    SDValue Ops[] = {Op0, Op1};
+    SDValue SAD = DAG.getNode(ISD::SAD, dl, MVT::x86mmx, Ops);
+    SDValue bitcast = DAG.getNode(ISD::BITCAST, dl, MVT::i64, SAD);
+    result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, bitcast);
+  }
+  else
+    result = SDValue(); 
+  return result;
+}
 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget,
                           SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
@@ -17106,6 +17147,7 @@
   case ISD::ADD:                return LowerADD(Op, DAG);
   case ISD::SUB:                return LowerSUB(Op, DAG);
   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
+  case ISD::SAD:                return LowerSAD(Op, Subtarget, DAG);
   }
 }
 
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -73,6 +73,8 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getMaxInterleaveFactor();
+  unsigned getSADInstrCost(unsigned ISD, Type *RetTy, 
+                        Type *op1, Type *op2);
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,6 +78,34 @@
   return 2;
 }
 
+unsigned X86TTIImpl::getSADInstrCost(unsigned ISD, Type *RetTy, 
+                                     Type *op1, Type *op2) {
+
+  unsigned result = 0;
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(op1);
+  EVT arg1Ty = TLI->getValueType(op1);
+  EVT arg2Ty = TLI->getValueType(op2);
+  assert((arg1Ty == arg2Ty) &&
+         "cannot handle different type of arguments for SAD");
+
+  MVT MTy = arg1Ty.getSimpleVT();
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE1CostTable[] = {
+      {ISD::SAD, MVT::v8i8, 4}, 
+      {ISD::SAD, MVT::v16i8, 5},
+  };
+
+  if (ST->hasSSE1() || ST->hasAVX()) {
+    int Idx = CostTableLookup(SSE1CostTable, ISD, MTy);
+    if (Idx != -1)
+      return result = LT.first * SSE1CostTable[Idx].Cost;
+  }
+  if (ST->is64Bit() || ST->hasMMX()) {
+    return 4;
+  }
+  return 0;
+}
+
 unsigned X86TTIImpl::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
     TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -289,7 +289,11 @@
   void createEmptyLoop();
   /// Copy and widen the instructions from the old loop.
   virtual void vectorizeLoop();
-
+ 
+  /// While widening the instructions, skip the SAD pattern and replace
+  /// with SAD intrinsic call.
+  void generateSADInstruction(Instruction *I, SmallVector<Value *, 4> &args,
+                              VectorParts &Entry);
   /// \brief The Loop exit block may have single value PHI nodes where the
   /// incoming value is 'Undef'. While vectorizing we only handled real values
   /// that were defined inside the loop. Here we fix the 'undef case'.
@@ -572,7 +576,8 @@
     RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
     RK_FloatMult,   ///< Product of floats.
-    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
+    RK_FloatMinMax,  ///< Min/max implemented in terms of select(cmp()).
+    RK_IntegerSpecial ///< Special patterns like SAD.
   };
 
   /// This enum represents the kinds of inductions that we support.
@@ -593,6 +598,45 @@
     MRK_FloatMax
   };
 
+  // This enum is used to keep track of actions to be performed on
+  // Instructions in special patterns. 
+  enum SpecialActionKind {
+    SAK_Invalid, 
+    SAK_Skip,    ///< Skip instruction which is part of special pattern
+    SAK_Replace, ///< replace the instruction
+    SAK_Arg,     ///< arguments 
+    SAK_Phi      ///< Phi instruction
+  };
+ 
+  // This keeps track of special pattern kind
+  enum SpecialPatternKind {
+    SPK_Invalid, 
+    SPK_Sad
+  };
+
+  // This struct holds information about each instruction in 
+  // Special Patterns
+  struct SpecialPatternDescriptor {
+    SpecialPatternDescriptor(SpecialPatternKind SPKind,
+                             SpecialActionKind SAK, unsigned miVF,
+                             unsigned maVF)
+        : Kind(SPKind), Action(SAK), minVF(miVF), maxVF(maVF) {}
+    SpecialPatternDescriptor() : Kind(SPK_Invalid), Action(SAK_Invalid),
+                                 minVF(0), maxVF(1) {}
+    // Kind - to keep track of which pattern the instruction belongs
+    SpecialPatternKind Kind;
+    // Action - What is the action to be performed for this instruction
+    SpecialActionKind Action;
+    // minVF - tells about the minimum VF for which this pattern is possible
+    // For Instance, in X86 the minVF for SAD is 8. We can not think of pattern
+    // replace for less than 8.
+    unsigned minVF;
+    // maXVF  tells about the maximun VF foe which this pattern is possible.
+    // For instance, in X86 the maxVF for SAD is 16. SAD operation cannot be 
+    // performed for VF more than 16 as 32-SAD is not present.
+    unsigned maxVF;
+  };
+
   /// This struct holds information about reduction variables.
   struct ReductionDescriptor {
     ReductionDescriptor() : StartValue(nullptr), LoopExitInstr(nullptr),
@@ -616,11 +660,16 @@
   /// This POD struct holds information about a potential reduction operation.
   struct ReductionInstDesc {
     ReductionInstDesc(bool IsRedux, Instruction *I) :
-      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
+      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid),
+      SPKind(SPK_Invalid) {}
 
     ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
       IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
 
+    ReductionInstDesc(Instruction *I, MinMaxReductionKind K, 
+                      SpecialPatternKind SPK) :
+      IsReduction(true), PatternLastInst(I), MinMaxKind(K), SPKind(SPK) {}
+
     // Is this instruction a reduction candidate.
     bool IsReduction;
     // The last instruction in a min/max pattern (select of the select(icmp())
@@ -628,6 +677,8 @@
     Instruction *PatternLastInst;
     // If this is a min/max pattern the comparison predicate.
     MinMaxReductionKind MinMaxKind;
+    // if special pattern then hold the pattern kind.
+    SpecialPatternKind SPKind;
   };
 
   /// A struct for saving information about induction variables.
@@ -703,6 +754,10 @@
   /// induction descriptor.
   typedef MapVector<PHINode*, InductionInfo> InductionList;
 
+  /// ActionList stores the actions to be performed for an instruction
+  /// which is part of special pattern in a basic block. 
+  typedef DenseMap<Instruction *, SpecialPatternDescriptor> ActionList;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -717,6 +772,9 @@
   /// Returns the induction variables found in the loop.
   InductionList *getInductionVars() { return &Inductions; }
 
+  /// Returns the actionlist for instructions in the loop.
+  ActionList *getActionMap() { return &SpecialActions; }
+
   /// Returns the widest induction type.
   Type *getWidestInductionType() { return WidestIndTy; }
 
@@ -808,6 +866,13 @@
   /// Collect the variables that need to stay uniform after vectorization.
   void collectLoopUniforms();
 
+  /// Returns true if we find a SAD pattern.
+  bool isSADPattern(PHINode *Phi);
+
+  /// Returns a ReductionInstDesc with SpecialPatternKind set if it matches
+  /// any special pattern.
+  ReductionInstDesc isSpecialPattern(PHINode *Phi, ReductionInstDesc &Prev);
+
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
@@ -874,6 +939,8 @@
   PHINode *Induction;
   /// Holds the reduction variables.
   ReductionList Reductions;
+  /// Holds the actions for instructions in Loop.
+  ActionList SpecialActions;
   /// Holds all of the induction variables that we found in the loop.
   /// Notice that inductions don't need to start at zero and that induction
   /// variables can be pointers.
@@ -935,6 +1002,9 @@
   /// 64 bit loop indices.
   unsigned getWidestType();
 
+  /// \return The size (in bits) of the smallest type in the code that
+  /// needs to be vectorized.
+  unsigned getSmallestType();
   /// \return The most profitable unroll factor.
   /// If UserUF is non-zero then this method finds the best unroll-factor
   /// based on register pressure and other parameters.
@@ -966,6 +1036,10 @@
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
+  
+  /// Returns the execution time cost of a special reduction instruction 
+  /// for a given vector width.
+  unsigned SpecialPhiCost(PHINode *p, unsigned VF);
 
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
@@ -2437,7 +2511,8 @@
   case RK_IntegerXor:
   case RK_IntegerAdd:
   case RK_IntegerOr:
-    // Adding, Xoring, Oring zero to a number does not change it.
+  case RK_IntegerSpecial:
+      // Adding, Xoring, Oring zero to a number does not change it.
     return ConstantInt::get(Tp, 0);
   case RK_IntegerMult:
     // Multiplying a number by 1 does not change it.
@@ -2461,6 +2536,7 @@
 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
   switch (Kind) {
     case LoopVectorizationLegality::RK_IntegerAdd:
+    case LoopVectorizationLegality::RK_IntegerSpecial:  
       return Instruction::Add;
     case LoopVectorizationLegality::RK_IntegerMult:
       return Instruction::Mul;
@@ -2597,6 +2673,47 @@
   return V;
 }
 
+void InnerLoopVectorizer::generateSADInstruction(
+     Instruction *I, SmallVector<Value *, 4> &args,
+                     VectorParts &Entry) {
+  LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+      (*Legal->getActionMap())[I];
+  switch (SPD.Action) {
+    case LoopVectorizationLegality::SAK_Arg: {
+      Instruction *cv = dyn_cast<CastInst>(I);
+      if (cv) {
+        VectorParts &Op = getVectorValue(cv->getOperand(0));
+        args.push_back(Op[0]);
+      }
+      break;
+    }
+    case LoopVectorizationLegality::SAK_Skip:
+      // do nothing
+      break;
+    case LoopVectorizationLegality::SAK_Replace: {
+      Module *M = I->getParent()->getParent()->getParent();
+      SmallVector<Type *, 4> Tys;
+      Tys.push_back(args[0]->getType());
+      Tys.push_back(args[1]->getType());
+      Function *F = Intrinsic::getDeclaration(M, Intrinsic::sad, Tys);
+      Value *intr_call = Builder.CreateCall(F, args);
+      VectorParts &phi = getVectorValue(I->getOperand(1));
+      Value *add = Builder.CreateBinOp(Instruction::Add, intr_call, phi[0]);
+      Entry[0] = add;
+      propagateMetadata(Entry, I);
+      break;
+    }
+    case LoopVectorizationLegality::SAK_Phi: {
+      Type *VecTy = I->getType();
+      Entry[0] = PHINode::Create(VecTy, 2, "sadvec.phi",
+                               LoopVectorBody.back()->getFirstInsertionPt());
+      break;
+    }
+    case LoopVectorizationLegality::SAK_Invalid:
+      assert(0 && "Unknown action to be performed!!");
+      break; 
+  }
+}
 void InnerLoopVectorizer::vectorizeLoop() {
   //===------------------------------------------------===//
   //
@@ -2639,6 +2756,7 @@
   for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
        it != e; ++it) {
     PHINode *RdxPhi = *it;
+    bool isSAD = false;
     assert(RdxPhi && "Unable to recover vectorized PHI");
 
     // Find the reduction variable descriptor.
@@ -2646,6 +2764,15 @@
            "Unable to find the reduction variable");
     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
     (*Legal->getReductionVars())[RdxPhi];
+ 
+    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) {
+      LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+          (*Legal->getActionMap())[RdxPhi];
+
+      if (SPD.minVF <= VF && SPD.maxVF >= VF &&
+          SPD.Kind == LoopVectorizationLegality::SPK_Sad )
+        isSAD = true;
+    }
 
     setDebugLocFromInst(Builder, RdxDesc.StartValue);
 
@@ -2673,6 +2800,15 @@
                                                            RdxDesc.StartValue,
                                                            "minmax.ident");
       }
+    } else if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerSpecial) {
+      LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+          (*Legal->getActionMap())[RdxPhi];
+      if (SPD.minVF <= VF && SPD.maxVF >= VF 
+          && SPD.Kind == LoopVectorizationLegality::SPK_Sad) {
+        //  assert(RdxDesc.SPKind == LoopVectorizationLegality::SPK_Sad &&
+        //    "Doesnot support other special kinds");
+        VectorStart = Identity = RdxDesc.StartValue;
+      }
     } else {
       // Handle other reduction kinds:
       Constant *Iden =
@@ -2747,7 +2883,7 @@
                                         ReducedPartRdx, RdxParts[part]);
     }
 
-    if (VF > 1) {
+    if (VF > 1 && !isSAD) {
       // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
       // and vector ops, reducing the set of values being computed by half each
       // round.
@@ -3028,8 +3164,28 @@
 
 void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
   // For each instruction in the old loop.
+  SmallVector<Value *, 4> args;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     VectorParts &Entry = WidenMap.get(it);
+    if (Legal->getActionMap()->count(it)) {
+      LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+          (*Legal->getActionMap())[it];
+      if (SPD.minVF <= VF && SPD.maxVF >= VF) {
+        switch (SPD.Kind) {
+        case LoopVectorizationLegality::SPK_Sad:
+          generateSADInstruction(it, args, Entry);
+          if (SPD.Action == LoopVectorizationLegality::SAK_Phi) {
+            PHINode *P = cast<PHINode>(it);
+            PV->push_back(P);
+          }
+          break;
+        case LoopVectorizationLegality::SPK_Invalid:
+          assert(0 && "Cannot be Invalid here!!");
+          break;
+        }
+        continue;
+      }
+    }
     switch (it->getOpcode()) {
     case Instruction::Br:
       // Nothing to do for PHIs and BR, since we already took care of the
@@ -3549,6 +3705,11 @@
           continue;
         }
 
+        // Check the Special Patterns.
+        if (AddReductionVar(Phi, RK_IntegerSpecial)) {
+          DEBUG(dbgs() << "LV: Found a Special reduction PHI." << *Phi <<"\n");
+          continue;
+        }
         if (AddReductionVar(Phi, RK_IntegerAdd)) {
           DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
           continue;
@@ -3779,6 +3940,16 @@
   return Stride;
 }
 
+LoopVectorizationLegality::ReductionInstDesc
+LoopVectorizationLegality::isSpecialPattern(PHINode *Phi,
+                                            ReductionInstDesc &Prev) {
+  if (isSADPattern(Phi)) {
+    return ReductionInstDesc(Phi, Prev.MinMaxKind, SPK_Sad);
+  } else {
+    return Prev;
+  }
+}
+
 void LoopVectorizationLegality::collectStridedAccess(Value *MemAccess) {
   Value *Ptr = nullptr;
   if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
@@ -4023,6 +4194,11 @@
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
+  ReduxDesc = isSpecialPattern(Phi, ReduxDesc);
+
+  if (Kind == RK_IntegerSpecial && ReduxDesc.SPKind == SPK_Invalid)
+    return false;
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.
 
@@ -4039,6 +4215,90 @@
   return true;
 }
 
+/// \brief This function checks for the following pattern
+
+///  %sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+///  ....
+///  %sub = sub nsw i32 %conv, %conv3
+///  %ispos = icmp uge i8 %0, %1
+///  %neg = sub nsw i32 0, %sub
+///  %2 = select i1 %ispos, i32 %sub, i32 %neg
+///  %add = add nsw i32 %2, %sum.05
+
+/// This pattern is matched from bottom. if everything matches
+/// then it returns true.
+bool LoopVectorizationLegality::isSADPattern(PHINode *Phi) {
+
+  LLVMContext &Context = TheLoop->getHeader()->getContext();
+  // SmallVector<Instruction*, 2> *SadIntrList = &SPDesc.IntrList;
+  for (User *U : Phi->users()) {
+    // This is supposed to be an add instruction
+    if (!isa<Instruction>(U))
+      return false;
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (UI->getOpcode() != Instruction::Add)
+      return false;
+    // all uses of Add instruction must be phi instructions
+    for (User *Ur : UI->users()) {
+      Instruction *I = dyn_cast<Instruction>(Ur);
+      if (!isa<PHINode>(I))
+        return false;
+    }
+    // Checking for Selection instruction
+    if (!isa<SelectInst>(UI->getOperand(0)))
+      return false;
+
+    //  two subs and one icmp instruction are matched
+    Instruction *Sel = dyn_cast<Instruction>(UI->getOperand(0));
+    if (!isa<Instruction>(Sel->getOperand(1)))
+      return false;
+    Instruction *Sub1 = dyn_cast<Instruction>(Sel->getOperand(1));
+    if (!isa<Instruction>(Sel->getOperand(2)))
+      return false;
+    Instruction *Sub2 = dyn_cast<Instruction>(Sel->getOperand(2));
+    if (Sub1->getOpcode() != Instruction::Sub ||
+        Sub2->getOpcode() != Instruction::Sub ||
+        !isa<ICmpInst>(Sel->getOperand(0)))
+      return false;
+    Instruction *cv1 = dyn_cast<CastInst>(Sub1->getOperand(0));
+    Instruction *cv2 = dyn_cast<CastInst>(Sub1->getOperand(1));
+    if (!cv1 || !cv2 || (cv1->getOperand(0)->getType()->getScalarType() !=
+                         Type::getInt8Ty(Context)) ||
+        (cv2->getOperand(0)->getType()->getScalarType() !=
+         Type::getInt8Ty(Context)))
+      return false;
+
+    if (!isa<Instruction>(Sel->getOperand(0)))
+      return false;
+    Instruction *icmp = dyn_cast<Instruction>(Sel->getOperand(0));
+    if (icmp->getOperand(0) != Sub1 || Sub2->getOperand(1) != Sub1) {
+      if (((icmp->getOperand(0) != cv1->getOperand(0)) &&
+           (icmp->getOperand(1) != cv1->getOperand(0))) ||
+          ((icmp->getOperand(0) != cv2->getOperand(0)) &&
+           (icmp->getOperand(1) != cv2->getOperand(0))))
+        return false;
+    }
+    // All these instructions are added to ActionList
+    SpecialActions[UI] = 
+        SpecialPatternDescriptor(SPK_Sad, SAK_Replace, 8, 16); // SAK_Replace
+    SpecialActions[Sel] =
+        SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+    SpecialActions[icmp] =
+        SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+    SpecialActions[Sub2] =
+        SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+    SpecialActions[Sub1] =
+        SpecialPatternDescriptor(SPK_Sad, SAK_Skip, 8, 16); // SAK_Skip
+    SpecialActions[cv1] =
+        SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg
+    SpecialActions[cv2] =
+        SpecialPatternDescriptor(SPK_Sad, SAK_Arg, 8, 16); // SAK_Arg
+    SpecialActions[Phi] = 
+        SpecialPatternDescriptor(SPK_Sad, SAK_Phi, 8, 16); // SAK_Phi
+  }
+  return true;
+}
+
 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
 /// pattern corresponding to a min(X, Y) or max(X, Y).
 LoopVectorizationLegality::ReductionInstDesc
@@ -4095,6 +4355,12 @@
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind,
                                             ReductionInstDesc &Prev) {
+  // This is to bypass this function for specialPatterns. We dont know which instruction 
+  // corresponds to reduction pattern.
+  if(Kind == RK_IntegerSpecial) {
+    return ReductionInstDesc(true, I); 
+  }
+  
   bool FP = I->getType()->isFloatingPointTy();
   bool FastMath = FP && I->hasUnsafeAlgebra();
   switch (I->getOpcode()) {
@@ -4279,7 +4545,7 @@
   unsigned TC = SE->getSmallConstantTripCount(TheLoop);
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
-  unsigned WidestType = getWidestType();
+  unsigned WidestType = getSmallestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
   unsigned MaxSafeDepDist = -1U;
   if (Legal->getMaxSafeDepDistBytes() != -1U)
@@ -4418,7 +4684,49 @@
 
   return MaxWidth;
 }
+unsigned LoopVectorizationCostModel::getSmallestType() {
+  unsigned MinWidth = 8;
 
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+    BasicBlock *BB = *bb;
+
+    // For each instruction in the loop.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      Type *T = it->getType();
+
+      // Ignore ephemeral values.
+      if (EphValues.count(it))
+        continue;
+
+      // Only examine Loads, Stores and PHINodes.
+      if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
+        continue;
+
+      // Examine PHI nodes that are reduction variables.
+      if (PHINode *PN = dyn_cast<PHINode>(it))
+        if (!Legal->getReductionVars()->count(PN))
+          continue;
+
+      // Examine the stored values.
+      if (StoreInst *ST = dyn_cast<StoreInst>(it))
+        T = ST->getValueOperand()->getType();
+
+      // Ignore loaded pointer types and stored pointer types that are not
+      // consecutive. However, we do want to take consecutive stores/loads of
+      // pointer vectors into account.
+      if (T->isPointerTy() && !isConsecutiveLoadOrStore(it))
+        continue;
+      
+      MinWidth = std::min(MinWidth,
+                          (unsigned)DL->getTypeSizeInBits(T->getScalarType()));
+    }
+  }
+
+  return MinWidth;
+}
+         
 unsigned
 LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                                                unsigned VF,
@@ -4704,6 +5012,22 @@
       if (EphValues.count(it))
         continue;
 
+      if (Legal->getActionMap()->count(it)) {
+        LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+            (*Legal->getActionMap())[it];
+        if (VF >= SPD.minVF && VF <= SPD.maxVF) {
+          if (SPD.Action == LoopVectorizationLegality::SAK_Phi) {
+             PHINode *p = dyn_cast<PHINode>(it);
+             unsigned PhiCost;
+             PhiCost= SpecialPhiCost(p, VF);
+             DEBUG(dbgs() << "LV: Found an estimated cost of " << PhiCost << " for VF " <<
+                     VF << " For special Phi instruction: " << *it << '\n');
+             BlockCost += PhiCost;
+          }
+          continue;
+        }
+      }
+
       unsigned C = getInstructionCost(it, VF);
 
       // Check if we should override the cost.
@@ -4786,6 +5110,36 @@
   return false;
 }
 
+unsigned 
+LoopVectorizationCostModel::SpecialPhiCost(PHINode *P, unsigned VF) {
+ 
+  if (Legal->getReductionVars()->count(P)) {
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+        (*Legal->getReductionVars())[P];
+    assert(Legal->getActionMap()->count(P) && "something wrong");
+    LoopVectorizationLegality::SpecialPatternDescriptor SPD =
+        (*Legal->getActionMap())[P];
+    if (VF >= SPD.minVF && VF <= SPD.maxVF) {
+      switch (SPD.Kind) {
+      case LoopVectorizationLegality::SPK_Sad: {
+        LLVMContext &Context = TheLoop->getHeader()->getContext();
+        SmallVector<Type *, 4> Tys;
+        Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF));
+        Tys.push_back(VectorType::get(Type::getInt8Ty(Context), VF));
+        Type *RetTy = Type::getInt32Ty(Context);
+        Intrinsic::ID ID = Intrinsic::sad;
+        unsigned IntrinsicCost = TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
+        return IntrinsicCost;
+      } break;
+
+      default:
+        break;
+      }
+    }
+  }
+  return 0;
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of